BBVectorize.cpp revision 87825e7970a361ce5a8bab19bc880ff7f6242ca2
1//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements a basic-block vectorization pass. The algorithm was 11// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, 12// et al. It works by looking for chains of pairable operations and then 13// pairing them. 14// 15//===----------------------------------------------------------------------===// 16 17#define BBV_NAME "bb-vectorize" 18#define DEBUG_TYPE BBV_NAME 19#include "llvm/Constants.h" 20#include "llvm/DerivedTypes.h" 21#include "llvm/Function.h" 22#include "llvm/Instructions.h" 23#include "llvm/IntrinsicInst.h" 24#include "llvm/Intrinsics.h" 25#include "llvm/LLVMContext.h" 26#include "llvm/Pass.h" 27#include "llvm/Type.h" 28#include "llvm/ADT/DenseMap.h" 29#include "llvm/ADT/DenseSet.h" 30#include "llvm/ADT/SmallVector.h" 31#include "llvm/ADT/Statistic.h" 32#include "llvm/ADT/STLExtras.h" 33#include "llvm/ADT/StringExtras.h" 34#include "llvm/Analysis/AliasAnalysis.h" 35#include "llvm/Analysis/AliasSetTracker.h" 36#include "llvm/Analysis/ScalarEvolution.h" 37#include "llvm/Analysis/ScalarEvolutionExpressions.h" 38#include "llvm/Analysis/ValueTracking.h" 39#include "llvm/Support/CommandLine.h" 40#include "llvm/Support/Debug.h" 41#include "llvm/Support/raw_ostream.h" 42#include "llvm/Support/ValueHandle.h" 43#include "llvm/Target/TargetData.h" 44#include "llvm/Transforms/Vectorize.h" 45#include <algorithm> 46#include <map> 47using namespace llvm; 48 49static cl::opt<unsigned> 50ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, 51 cl::desc("The required chain depth for vectorization")); 52 53static cl::opt<unsigned> 54SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, 55 cl::desc("The maximum search distance for instruction pairs")); 56 57static cl::opt<bool> 58SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, 59 cl::desc("Replicating one element to a pair breaks the chain")); 60 61static cl::opt<unsigned> 62VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, 63 cl::desc("The size of the native vector registers")); 64 65static cl::opt<unsigned> 66MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, 67 cl::desc("The maximum number of pairing iterations")); 68 69static cl::opt<unsigned> 70MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, 71 cl::desc("The maximum number of pairable instructions per group")); 72 73static cl::opt<unsigned> 74MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), 75 cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" 76 " a full cycle check")); 77 78static cl::opt<bool> 79NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, 80 cl::desc("Don't try to vectorize integer values")); 81 82static cl::opt<bool> 83NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, 84 cl::desc("Don't try to vectorize floating-point values")); 85 86static cl::opt<bool> 87NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, 88 cl::desc("Don't try to vectorize casting (conversion) operations")); 89 90static cl::opt<bool> 91NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, 92 cl::desc("Don't try to vectorize floating-point math intrinsics")); 93 94static cl::opt<bool> 95NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, 96 cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); 97 98static cl::opt<bool> 99NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, 100 cl::desc("Don't try to vectorize loads and stores")); 101 102static cl::opt<bool> 103AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, 104 cl::desc("Only generate aligned loads and stores")); 105 106static cl::opt<bool> 107NoMemOpBoost("bb-vectorize-no-mem-op-boost", 108 cl::init(false), cl::Hidden, 109 cl::desc("Don't boost the chain-depth contribution of loads and stores")); 110 111static cl::opt<bool> 112FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, 113 cl::desc("Use a fast instruction dependency analysis")); 114 115#ifndef NDEBUG 116static cl::opt<bool> 117DebugInstructionExamination("bb-vectorize-debug-instruction-examination", 118 cl::init(false), cl::Hidden, 119 cl::desc("When debugging is enabled, output information on the" 120 " instruction-examination process")); 121static cl::opt<bool> 122DebugCandidateSelection("bb-vectorize-debug-candidate-selection", 123 cl::init(false), cl::Hidden, 124 cl::desc("When debugging is enabled, output information on the" 125 " candidate-selection process")); 126static cl::opt<bool> 127DebugPairSelection("bb-vectorize-debug-pair-selection", 128 cl::init(false), cl::Hidden, 129 cl::desc("When debugging is enabled, output information on the" 130 " pair-selection process")); 131static cl::opt<bool> 132DebugCycleCheck("bb-vectorize-debug-cycle-check", 133 cl::init(false), cl::Hidden, 134 cl::desc("When debugging is enabled, output information on the" 135 " cycle-checking process")); 136#endif 137 138STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); 139 140namespace { 141 struct BBVectorize : public BasicBlockPass { 142 static char ID; // Pass identification, replacement for typeid 143 BBVectorize() : BasicBlockPass(ID) { 144 initializeBBVectorizePass(*PassRegistry::getPassRegistry()); 145 } 146 147 BBVectorize(Pass *P) : BasicBlockPass(ID) { 148 AA = &P->getAnalysis<AliasAnalysis>(); 149 SE = &P->getAnalysis<ScalarEvolution>(); 150 TD = P->getAnalysisIfAvailable<TargetData>(); 151 } 152 153 typedef std::pair<Value *, Value *> ValuePair; 154 typedef std::pair<ValuePair, size_t> ValuePairWithDepth; 155 typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair 156 typedef std::pair<std::multimap<Value *, Value *>::iterator, 157 std::multimap<Value *, Value *>::iterator> VPIteratorPair; 158 typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator, 159 std::multimap<ValuePair, ValuePair>::iterator> 160 VPPIteratorPair; 161 162 AliasAnalysis *AA; 163 ScalarEvolution *SE; 164 TargetData *TD; 165 166 // FIXME: const correct? 167 168 bool vectorizePairs(BasicBlock &BB); 169 170 bool getCandidatePairs(BasicBlock &BB, 171 BasicBlock::iterator &Start, 172 std::multimap<Value *, Value *> &CandidatePairs, 173 std::vector<Value *> &PairableInsts); 174 175 void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, 176 std::vector<Value *> &PairableInsts, 177 std::multimap<ValuePair, ValuePair> &ConnectedPairs); 178 179 void buildDepMap(BasicBlock &BB, 180 std::multimap<Value *, Value *> &CandidatePairs, 181 std::vector<Value *> &PairableInsts, 182 DenseSet<ValuePair> &PairableInstUsers); 183 184 void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, 185 std::vector<Value *> &PairableInsts, 186 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 187 DenseSet<ValuePair> &PairableInstUsers, 188 DenseMap<Value *, Value *>& ChosenPairs); 189 190 void fuseChosenPairs(BasicBlock &BB, 191 std::vector<Value *> &PairableInsts, 192 DenseMap<Value *, Value *>& ChosenPairs); 193 194 bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); 195 196 bool areInstsCompatible(Instruction *I, Instruction *J, 197 bool IsSimpleLoadStore); 198 199 bool trackUsesOfI(DenseSet<Value *> &Users, 200 AliasSetTracker &WriteSet, Instruction *I, 201 Instruction *J, bool UpdateUsers = true, 202 std::multimap<Value *, Value *> *LoadMoveSet = 0); 203 204 void computePairsConnectedTo( 205 std::multimap<Value *, Value *> &CandidatePairs, 206 std::vector<Value *> &PairableInsts, 207 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 208 ValuePair P); 209 210 bool pairsConflict(ValuePair P, ValuePair Q, 211 DenseSet<ValuePair> &PairableInstUsers, 212 std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0); 213 214 bool pairWillFormCycle(ValuePair P, 215 std::multimap<ValuePair, ValuePair> &PairableInstUsers, 216 DenseSet<ValuePair> &CurrentPairs); 217 218 void pruneTreeFor( 219 std::multimap<Value *, Value *> &CandidatePairs, 220 std::vector<Value *> &PairableInsts, 221 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 222 DenseSet<ValuePair> &PairableInstUsers, 223 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 224 DenseMap<Value *, Value *> &ChosenPairs, 225 DenseMap<ValuePair, size_t> &Tree, 226 DenseSet<ValuePair> &PrunedTree, ValuePair J, 227 bool UseCycleCheck); 228 229 void buildInitialTreeFor( 230 std::multimap<Value *, Value *> &CandidatePairs, 231 std::vector<Value *> &PairableInsts, 232 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 233 DenseSet<ValuePair> &PairableInstUsers, 234 DenseMap<Value *, Value *> &ChosenPairs, 235 DenseMap<ValuePair, size_t> &Tree, ValuePair J); 236 237 void findBestTreeFor( 238 std::multimap<Value *, Value *> &CandidatePairs, 239 std::vector<Value *> &PairableInsts, 240 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 241 DenseSet<ValuePair> &PairableInstUsers, 242 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 243 DenseMap<Value *, Value *> &ChosenPairs, 244 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, 245 size_t &BestEffSize, VPIteratorPair ChoiceRange, 246 bool UseCycleCheck); 247 248 Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, 249 Instruction *J, unsigned o, bool &FlipMemInputs); 250 251 void fillNewShuffleMask(LLVMContext& Context, Instruction *J, 252 unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, 253 unsigned IdxOffset, std::vector<Constant*> &Mask); 254 255 Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, 256 Instruction *J); 257 258 Value *getReplacementInput(LLVMContext& Context, Instruction *I, 259 Instruction *J, unsigned o, bool FlipMemInputs); 260 261 void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, 262 Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, 263 bool &FlipMemInputs); 264 265 void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 266 Instruction *J, Instruction *K, 267 Instruction *&InsertionPt, Instruction *&K1, 268 Instruction *&K2, bool &FlipMemInputs); 269 270 void collectPairLoadMoveSet(BasicBlock &BB, 271 DenseMap<Value *, Value *> &ChosenPairs, 272 std::multimap<Value *, Value *> &LoadMoveSet, 273 Instruction *I); 274 275 void collectLoadMoveSet(BasicBlock &BB, 276 std::vector<Value *> &PairableInsts, 277 DenseMap<Value *, Value *> &ChosenPairs, 278 std::multimap<Value *, Value *> &LoadMoveSet); 279 280 bool canMoveUsesOfIAfterJ(BasicBlock &BB, 281 std::multimap<Value *, Value *> &LoadMoveSet, 282 Instruction *I, Instruction *J); 283 284 void moveUsesOfIAfterJ(BasicBlock &BB, 285 std::multimap<Value *, Value *> &LoadMoveSet, 286 Instruction *&InsertionPt, 287 Instruction *I, Instruction *J); 288 289 bool vectorizeBB(BasicBlock &BB) { 290 bool changed = false; 291 // Iterate a sufficient number of times to merge types of size 1 bit, 292 // then 2 bits, then 4, etc. up to half of the target vector width of the 293 // target vector register. 294 for (unsigned v = 2, n = 1; v <= VectorBits && (!MaxIter || n <= MaxIter); 295 v *= 2, ++n) { 296 DEBUG(dbgs() << "BBV: fusing loop #" << n << 297 " for " << BB.getName() << " in " << 298 BB.getParent()->getName() << "...\n"); 299 if (vectorizePairs(BB)) 300 changed = true; 301 else 302 break; 303 } 304 305 DEBUG(dbgs() << "BBV: done!\n"); 306 return changed; 307 } 308 309 virtual bool runOnBasicBlock(BasicBlock &BB) { 310 AA = &getAnalysis<AliasAnalysis>(); 311 SE = &getAnalysis<ScalarEvolution>(); 312 TD = getAnalysisIfAvailable<TargetData>(); 313 314 return vectorizeBB(BB); 315 } 316 317 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 318 BasicBlockPass::getAnalysisUsage(AU); 319 AU.addRequired<AliasAnalysis>(); 320 AU.addRequired<ScalarEvolution>(); 321 AU.addPreserved<AliasAnalysis>(); 322 AU.addPreserved<ScalarEvolution>(); 323 AU.setPreservesCFG(); 324 } 325 326 // This returns the vector type that holds a pair of the provided type. 327 // If the provided type is already a vector, then its length is doubled. 328 static inline VectorType *getVecTypeForPair(Type *ElemTy) { 329 if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { 330 unsigned numElem = VTy->getNumElements(); 331 return VectorType::get(ElemTy->getScalarType(), numElem*2); 332 } 333 334 return VectorType::get(ElemTy, 2); 335 } 336 337 // Returns the weight associated with the provided value. A chain of 338 // candidate pairs has a length given by the sum of the weights of its 339 // members (one weight per pair; the weight of each member of the pair 340 // is assumed to be the same). This length is then compared to the 341 // chain-length threshold to determine if a given chain is significant 342 // enough to be vectorized. The length is also used in comparing 343 // candidate chains where longer chains are considered to be better. 344 // Note: when this function returns 0, the resulting instructions are 345 // not actually fused. 346 static inline size_t getDepthFactor(Value *V) { 347 // InsertElement and ExtractElement have a depth factor of zero. This is 348 // for two reasons: First, they cannot be usefully fused. Second, because 349 // the pass generates a lot of these, they can confuse the simple metric 350 // used to compare the trees in the next iteration. Thus, giving them a 351 // weight of zero allows the pass to essentially ignore them in 352 // subsequent iterations when looking for vectorization opportunities 353 // while still tracking dependency chains that flow through those 354 // instructions. 355 if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V)) 356 return 0; 357 358 // Give a load or store half of the required depth so that load/store 359 // pairs will vectorize. 360 if (!NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V))) 361 return ReqChainDepth/2; 362 363 return 1; 364 } 365 366 // This determines the relative offset of two loads or stores, returning 367 // true if the offset could be determined to be some constant value. 368 // For example, if OffsetInElmts == 1, then J accesses the memory directly 369 // after I; if OffsetInElmts == -1 then I accesses the memory 370 // directly after J. This function assumes that both instructions 371 // have the same type. 372 bool getPairPtrInfo(Instruction *I, Instruction *J, 373 Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, 374 int64_t &OffsetInElmts) { 375 OffsetInElmts = 0; 376 if (isa<LoadInst>(I)) { 377 IPtr = cast<LoadInst>(I)->getPointerOperand(); 378 JPtr = cast<LoadInst>(J)->getPointerOperand(); 379 IAlignment = cast<LoadInst>(I)->getAlignment(); 380 JAlignment = cast<LoadInst>(J)->getAlignment(); 381 } else { 382 IPtr = cast<StoreInst>(I)->getPointerOperand(); 383 JPtr = cast<StoreInst>(J)->getPointerOperand(); 384 IAlignment = cast<StoreInst>(I)->getAlignment(); 385 JAlignment = cast<StoreInst>(J)->getAlignment(); 386 } 387 388 const SCEV *IPtrSCEV = SE->getSCEV(IPtr); 389 const SCEV *JPtrSCEV = SE->getSCEV(JPtr); 390 391 // If this is a trivial offset, then we'll get something like 392 // 1*sizeof(type). With target data, which we need anyway, this will get 393 // constant folded into a number. 394 const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); 395 if (const SCEVConstant *ConstOffSCEV = 396 dyn_cast<SCEVConstant>(OffsetSCEV)) { 397 ConstantInt *IntOff = ConstOffSCEV->getValue(); 398 int64_t Offset = IntOff->getSExtValue(); 399 400 Type *VTy = cast<PointerType>(IPtr->getType())->getElementType(); 401 int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); 402 403 assert(VTy == cast<PointerType>(JPtr->getType())->getElementType()); 404 405 OffsetInElmts = Offset/VTyTSS; 406 return (abs64(Offset) % VTyTSS) == 0; 407 } 408 409 return false; 410 } 411 412 // Returns true if the provided CallInst represents an intrinsic that can 413 // be vectorized. 414 bool isVectorizableIntrinsic(CallInst* I) { 415 Function *F = I->getCalledFunction(); 416 if (!F) return false; 417 418 unsigned IID = F->getIntrinsicID(); 419 if (!IID) return false; 420 421 switch(IID) { 422 default: 423 return false; 424 case Intrinsic::sqrt: 425 case Intrinsic::powi: 426 case Intrinsic::sin: 427 case Intrinsic::cos: 428 case Intrinsic::log: 429 case Intrinsic::log2: 430 case Intrinsic::log10: 431 case Intrinsic::exp: 432 case Intrinsic::exp2: 433 case Intrinsic::pow: 434 return !NoMath; 435 case Intrinsic::fma: 436 return !NoFMA; 437 } 438 } 439 440 // Returns true if J is the second element in some pair referenced by 441 // some multimap pair iterator pair. 442 template <typename V> 443 bool isSecondInIteratorPair(V J, std::pair< 444 typename std::multimap<V, V>::iterator, 445 typename std::multimap<V, V>::iterator> PairRange) { 446 for (typename std::multimap<V, V>::iterator K = PairRange.first; 447 K != PairRange.second; ++K) 448 if (K->second == J) return true; 449 450 return false; 451 } 452 }; 453 454 // This function implements one vectorization iteration on the provided 455 // basic block. It returns true if the block is changed. 456 bool BBVectorize::vectorizePairs(BasicBlock &BB) { 457 bool ShouldContinue; 458 BasicBlock::iterator Start = BB.getFirstInsertionPt(); 459 460 std::vector<Value *> AllPairableInsts; 461 DenseMap<Value *, Value *> AllChosenPairs; 462 463 do { 464 std::vector<Value *> PairableInsts; 465 std::multimap<Value *, Value *> CandidatePairs; 466 ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, 467 PairableInsts); 468 if (PairableInsts.empty()) continue; 469 470 // Now we have a map of all of the pairable instructions and we need to 471 // select the best possible pairing. A good pairing is one such that the 472 // users of the pair are also paired. This defines a (directed) forest 473 // over the pairs such that two pairs are connected iff the second pair 474 // uses the first. 475 476 // Note that it only matters that both members of the second pair use some 477 // element of the first pair (to allow for splatting). 478 479 std::multimap<ValuePair, ValuePair> ConnectedPairs; 480 computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs); 481 if (ConnectedPairs.empty()) continue; 482 483 // Build the pairable-instruction dependency map 484 DenseSet<ValuePair> PairableInstUsers; 485 buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); 486 487 // There is now a graph of the connected pairs. For each variable, pick 488 // the pairing with the largest tree meeting the depth requirement on at 489 // least one branch. Then select all pairings that are part of that tree 490 // and remove them from the list of available pairings and pairable 491 // variables. 492 493 DenseMap<Value *, Value *> ChosenPairs; 494 choosePairs(CandidatePairs, PairableInsts, ConnectedPairs, 495 PairableInstUsers, ChosenPairs); 496 497 if (ChosenPairs.empty()) continue; 498 AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), 499 PairableInsts.end()); 500 AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); 501 } while (ShouldContinue); 502 503 if (AllChosenPairs.empty()) return false; 504 NumFusedOps += AllChosenPairs.size(); 505 506 // A set of pairs has now been selected. It is now necessary to replace the 507 // paired instructions with vector instructions. For this procedure each 508 // operand must be replaced with a vector operand. This vector is formed 509 // by using build_vector on the old operands. The replaced values are then 510 // replaced with a vector_extract on the result. Subsequent optimization 511 // passes should coalesce the build/extract combinations. 512 513 fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs); 514 return true; 515 } 516 517 // This function returns true if the provided instruction is capable of being 518 // fused into a vector instruction. This determination is based only on the 519 // type and other attributes of the instruction. 520 bool BBVectorize::isInstVectorizable(Instruction *I, 521 bool &IsSimpleLoadStore) { 522 IsSimpleLoadStore = false; 523 524 if (CallInst *C = dyn_cast<CallInst>(I)) { 525 if (!isVectorizableIntrinsic(C)) 526 return false; 527 } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { 528 // Vectorize simple loads if possbile: 529 IsSimpleLoadStore = L->isSimple(); 530 if (!IsSimpleLoadStore || NoMemOps) 531 return false; 532 } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { 533 // Vectorize simple stores if possbile: 534 IsSimpleLoadStore = S->isSimple(); 535 if (!IsSimpleLoadStore || NoMemOps) 536 return false; 537 } else if (CastInst *C = dyn_cast<CastInst>(I)) { 538 // We can vectorize casts, but not casts of pointer types, etc. 539 if (NoCasts) 540 return false; 541 542 Type *SrcTy = C->getSrcTy(); 543 if (!SrcTy->isSingleValueType() || SrcTy->isPointerTy()) 544 return false; 545 546 Type *DestTy = C->getDestTy(); 547 if (!DestTy->isSingleValueType() || DestTy->isPointerTy()) 548 return false; 549 } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || 550 isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { 551 return false; 552 } 553 554 // We can't vectorize memory operations without target data 555 if (TD == 0 && IsSimpleLoadStore) 556 return false; 557 558 Type *T1, *T2; 559 if (isa<StoreInst>(I)) { 560 // For stores, it is the value type, not the pointer type that matters 561 // because the value is what will come from a vector register. 562 563 Value *IVal = cast<StoreInst>(I)->getValueOperand(); 564 T1 = IVal->getType(); 565 } else { 566 T1 = I->getType(); 567 } 568 569 if (I->isCast()) 570 T2 = cast<CastInst>(I)->getSrcTy(); 571 else 572 T2 = T1; 573 574 // Not every type can be vectorized... 575 if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || 576 !(VectorType::isValidElementType(T2) || T2->isVectorTy())) 577 return false; 578 579 if (NoInts && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) 580 return false; 581 582 if (NoFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) 583 return false; 584 585 if (T1->getPrimitiveSizeInBits() > VectorBits/2 || 586 T2->getPrimitiveSizeInBits() > VectorBits/2) 587 return false; 588 589 return true; 590 } 591 592 // This function returns true if the two provided instructions are compatible 593 // (meaning that they can be fused into a vector instruction). This assumes 594 // that I has already been determined to be vectorizable and that J is not 595 // in the use tree of I. 596 bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, 597 bool IsSimpleLoadStore) { 598 DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << 599 " <-> " << *J << "\n"); 600 601 // Loads and stores can be merged if they have different alignments, 602 // but are otherwise the same. 603 LoadInst *LI, *LJ; 604 StoreInst *SI, *SJ; 605 if ((LI = dyn_cast<LoadInst>(I)) && (LJ = dyn_cast<LoadInst>(J))) { 606 if (I->getType() != J->getType()) 607 return false; 608 609 if (LI->getPointerOperand()->getType() != 610 LJ->getPointerOperand()->getType() || 611 LI->isVolatile() != LJ->isVolatile() || 612 LI->getOrdering() != LJ->getOrdering() || 613 LI->getSynchScope() != LJ->getSynchScope()) 614 return false; 615 } else if ((SI = dyn_cast<StoreInst>(I)) && (SJ = dyn_cast<StoreInst>(J))) { 616 if (SI->getValueOperand()->getType() != 617 SJ->getValueOperand()->getType() || 618 SI->getPointerOperand()->getType() != 619 SJ->getPointerOperand()->getType() || 620 SI->isVolatile() != SJ->isVolatile() || 621 SI->getOrdering() != SJ->getOrdering() || 622 SI->getSynchScope() != SJ->getSynchScope()) 623 return false; 624 } else if (!J->isSameOperationAs(I)) { 625 return false; 626 } 627 // FIXME: handle addsub-type operations! 628 629 if (IsSimpleLoadStore) { 630 Value *IPtr, *JPtr; 631 unsigned IAlignment, JAlignment; 632 int64_t OffsetInElmts = 0; 633 if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 634 OffsetInElmts) && abs64(OffsetInElmts) == 1) { 635 if (AlignedOnly) { 636 Type *aType = isa<StoreInst>(I) ? 637 cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); 638 // An aligned load or store is possible only if the instruction 639 // with the lower offset has an alignment suitable for the 640 // vector type. 641 642 unsigned BottomAlignment = IAlignment; 643 if (OffsetInElmts < 0) BottomAlignment = JAlignment; 644 645 Type *VType = getVecTypeForPair(aType); 646 unsigned VecAlignment = TD->getPrefTypeAlignment(VType); 647 if (BottomAlignment < VecAlignment) 648 return false; 649 } 650 } else { 651 return false; 652 } 653 } else if (isa<ShuffleVectorInst>(I)) { 654 // Only merge two shuffles if they're both constant 655 return isa<Constant>(I->getOperand(2)) && 656 isa<Constant>(J->getOperand(2)); 657 // FIXME: We may want to vectorize non-constant shuffles also. 658 } 659 660 // The powi intrinsic is special because only the first argument is 661 // vectorized, the second arguments must be equal. 662 CallInst *CI = dyn_cast<CallInst>(I); 663 Function *FI; 664 if (CI && (FI = CI->getCalledFunction()) && 665 FI->getIntrinsicID() == Intrinsic::powi) { 666 667 Value *A1I = CI->getArgOperand(1), 668 *A1J = cast<CallInst>(J)->getArgOperand(1); 669 const SCEV *A1ISCEV = SE->getSCEV(A1I), 670 *A1JSCEV = SE->getSCEV(A1J); 671 return (A1ISCEV == A1JSCEV); 672 } 673 674 return true; 675 } 676 677 // Figure out whether or not J uses I and update the users and write-set 678 // structures associated with I. Specifically, Users represents the set of 679 // instructions that depend on I. WriteSet represents the set 680 // of memory locations that are dependent on I. If UpdateUsers is true, 681 // and J uses I, then Users is updated to contain J and WriteSet is updated 682 // to contain any memory locations to which J writes. The function returns 683 // true if J uses I. By default, alias analysis is used to determine 684 // whether J reads from memory that overlaps with a location in WriteSet. 685 // If LoadMoveSet is not null, then it is a previously-computed multimap 686 // where the key is the memory-based user instruction and the value is 687 // the instruction to be compared with I. So, if LoadMoveSet is provided, 688 // then the alias analysis is not used. This is necessary because this 689 // function is called during the process of moving instructions during 690 // vectorization and the results of the alias analysis are not stable during 691 // that process. 692 bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, 693 AliasSetTracker &WriteSet, Instruction *I, 694 Instruction *J, bool UpdateUsers, 695 std::multimap<Value *, Value *> *LoadMoveSet) { 696 bool UsesI = false; 697 698 // This instruction may already be marked as a user due, for example, to 699 // being a member of a selected pair. 700 if (Users.count(J)) 701 UsesI = true; 702 703 if (!UsesI) 704 for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); 705 JU != JE; ++JU) { 706 Value *V = *JU; 707 if (I == V || Users.count(V)) { 708 UsesI = true; 709 break; 710 } 711 } 712 if (!UsesI && J->mayReadFromMemory()) { 713 if (LoadMoveSet) { 714 VPIteratorPair JPairRange = LoadMoveSet->equal_range(J); 715 UsesI = isSecondInIteratorPair<Value*>(I, JPairRange); 716 } else { 717 for (AliasSetTracker::iterator W = WriteSet.begin(), 718 WE = WriteSet.end(); W != WE; ++W) { 719 if (W->aliasesUnknownInst(J, *AA)) { 720 UsesI = true; 721 break; 722 } 723 } 724 } 725 } 726 727 if (UsesI && UpdateUsers) { 728 if (J->mayWriteToMemory()) WriteSet.add(J); 729 Users.insert(J); 730 } 731 732 return UsesI; 733 } 734 735 // This function iterates over all instruction pairs in the provided 736 // basic block and collects all candidate pairs for vectorization. 737 bool BBVectorize::getCandidatePairs(BasicBlock &BB, 738 BasicBlock::iterator &Start, 739 std::multimap<Value *, Value *> &CandidatePairs, 740 std::vector<Value *> &PairableInsts) { 741 BasicBlock::iterator E = BB.end(); 742 if (Start == E) return false; 743 744 bool ShouldContinue = false, IAfterStart = false; 745 for (BasicBlock::iterator I = Start++; I != E; ++I) { 746 if (I == Start) IAfterStart = true; 747 748 bool IsSimpleLoadStore; 749 if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; 750 751 // Look for an instruction with which to pair instruction *I... 752 DenseSet<Value *> Users; 753 AliasSetTracker WriteSet(*AA); 754 bool JAfterStart = IAfterStart; 755 BasicBlock::iterator J = llvm::next(I); 756 for (unsigned ss = 0; J != E && ss <= SearchLimit; ++J, ++ss) { 757 if (J == Start) JAfterStart = true; 758 759 // Determine if J uses I, if so, exit the loop. 760 bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !FastDep); 761 if (FastDep) { 762 // Note: For this heuristic to be effective, independent operations 763 // must tend to be intermixed. This is likely to be true from some 764 // kinds of grouped loop unrolling (but not the generic LLVM pass), 765 // but otherwise may require some kind of reordering pass. 766 767 // When using fast dependency analysis, 768 // stop searching after first use: 769 if (UsesI) break; 770 } else { 771 if (UsesI) continue; 772 } 773 774 // J does not use I, and comes before the first use of I, so it can be 775 // merged with I if the instructions are compatible. 776 if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue; 777 778 // J is a candidate for merging with I. 779 if (!PairableInsts.size() || 780 PairableInsts[PairableInsts.size()-1] != I) { 781 PairableInsts.push_back(I); 782 } 783 784 CandidatePairs.insert(ValuePair(I, J)); 785 786 // The next call to this function must start after the last instruction 787 // selected during this invocation. 788 if (JAfterStart) { 789 Start = llvm::next(J); 790 IAfterStart = JAfterStart = false; 791 } 792 793 DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " 794 << *I << " <-> " << *J << "\n"); 795 796 // If we have already found too many pairs, break here and this function 797 // will be called again starting after the last instruction selected 798 // during this invocation. 799 if (PairableInsts.size() >= MaxInsts) { 800 ShouldContinue = true; 801 break; 802 } 803 } 804 805 if (ShouldContinue) 806 break; 807 } 808 809 DEBUG(dbgs() << "BBV: found " << PairableInsts.size() 810 << " instructions with candidate pairs\n"); 811 812 return ShouldContinue; 813 } 814 815 // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that 816 // it looks for pairs such that both members have an input which is an 817 // output of PI or PJ. 818 void BBVectorize::computePairsConnectedTo( 819 std::multimap<Value *, Value *> &CandidatePairs, 820 std::vector<Value *> &PairableInsts, 821 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 822 ValuePair P) { 823 // For each possible pairing for this variable, look at the uses of 824 // the first value... 825 for (Value::use_iterator I = P.first->use_begin(), 826 E = P.first->use_end(); I != E; ++I) { 827 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); 828 829 // For each use of the first variable, look for uses of the second 830 // variable... 831 for (Value::use_iterator J = P.second->use_begin(), 832 E2 = P.second->use_end(); J != E2; ++J) { 833 VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); 834 835 // Look for <I, J>: 836 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) 837 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); 838 839 // Look for <J, I>: 840 if (isSecondInIteratorPair<Value*>(*I, JPairRange)) 841 ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I))); 842 } 843 844 if (SplatBreaksChain) continue; 845 // Look for cases where just the first value in the pair is used by 846 // both members of another pair (splatting). 847 for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) { 848 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) 849 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); 850 } 851 } 852 853 if (SplatBreaksChain) return; 854 // Look for cases where just the second value in the pair is used by 855 // both members of another pair (splatting). 856 for (Value::use_iterator I = P.second->use_begin(), 857 E = P.second->use_end(); I != E; ++I) { 858 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); 859 860 for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { 861 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) 862 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); 863 } 864 } 865 } 866 867 // This function figures out which pairs are connected. Two pairs are 868 // connected if some output of the first pair forms an input to both members 869 // of the second pair. 870 void BBVectorize::computeConnectedPairs( 871 std::multimap<Value *, Value *> &CandidatePairs, 872 std::vector<Value *> &PairableInsts, 873 std::multimap<ValuePair, ValuePair> &ConnectedPairs) { 874 875 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 876 PE = PairableInsts.end(); PI != PE; ++PI) { 877 VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI); 878 879 for (std::multimap<Value *, Value *>::iterator P = choiceRange.first; 880 P != choiceRange.second; ++P) 881 computePairsConnectedTo(CandidatePairs, PairableInsts, 882 ConnectedPairs, *P); 883 } 884 885 DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size() 886 << " pair connections.\n"); 887 } 888 889 // This function builds a set of use tuples such that <A, B> is in the set 890 // if B is in the use tree of A. If B is in the use tree of A, then B 891 // depends on the output of A. 892 void BBVectorize::buildDepMap( 893 BasicBlock &BB, 894 std::multimap<Value *, Value *> &CandidatePairs, 895 std::vector<Value *> &PairableInsts, 896 DenseSet<ValuePair> &PairableInstUsers) { 897 DenseSet<Value *> IsInPair; 898 for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(), 899 E = CandidatePairs.end(); C != E; ++C) { 900 IsInPair.insert(C->first); 901 IsInPair.insert(C->second); 902 } 903 904 // Iterate through the basic block, recording all Users of each 905 // pairable instruction. 906 907 BasicBlock::iterator E = BB.end(); 908 for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { 909 if (IsInPair.find(I) == IsInPair.end()) continue; 910 911 DenseSet<Value *> Users; 912 AliasSetTracker WriteSet(*AA); 913 for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) 914 (void) trackUsesOfI(Users, WriteSet, I, J); 915 916 for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); 917 U != E; ++U) 918 PairableInstUsers.insert(ValuePair(I, *U)); 919 } 920 } 921 922 // Returns true if an input to pair P is an output of pair Q and also an 923 // input of pair Q is an output of pair P. If this is the case, then these 924 // two pairs cannot be simultaneously fused. 925 bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, 926 DenseSet<ValuePair> &PairableInstUsers, 927 std::multimap<ValuePair, ValuePair> *PairableInstUserMap) { 928 // Two pairs are in conflict if they are mutual Users of eachother. 929 bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || 930 PairableInstUsers.count(ValuePair(P.first, Q.second)) || 931 PairableInstUsers.count(ValuePair(P.second, Q.first)) || 932 PairableInstUsers.count(ValuePair(P.second, Q.second)); 933 bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || 934 PairableInstUsers.count(ValuePair(Q.first, P.second)) || 935 PairableInstUsers.count(ValuePair(Q.second, P.first)) || 936 PairableInstUsers.count(ValuePair(Q.second, P.second)); 937 if (PairableInstUserMap) { 938 // FIXME: The expensive part of the cycle check is not so much the cycle 939 // check itself but this edge insertion procedure. This needs some 940 // profiling and probably a different data structure (same is true of 941 // most uses of std::multimap). 942 if (PUsesQ) { 943 VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q); 944 if (!isSecondInIteratorPair(P, QPairRange)) 945 PairableInstUserMap->insert(VPPair(Q, P)); 946 } 947 if (QUsesP) { 948 VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P); 949 if (!isSecondInIteratorPair(Q, PPairRange)) 950 PairableInstUserMap->insert(VPPair(P, Q)); 951 } 952 } 953 954 return (QUsesP && PUsesQ); 955 } 956 957 // This function walks the use graph of current pairs to see if, starting 958 // from P, the walk returns to P. 959 bool BBVectorize::pairWillFormCycle(ValuePair P, 960 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 961 DenseSet<ValuePair> &CurrentPairs) { 962 DEBUG(if (DebugCycleCheck) 963 dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " 964 << *P.second << "\n"); 965 // A lookup table of visisted pairs is kept because the PairableInstUserMap 966 // contains non-direct associations. 967 DenseSet<ValuePair> Visited; 968 SmallVector<ValuePair, 32> Q; 969 // General depth-first post-order traversal: 970 Q.push_back(P); 971 do { 972 ValuePair QTop = Q.pop_back_val(); 973 Visited.insert(QTop); 974 975 DEBUG(if (DebugCycleCheck) 976 dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " 977 << *QTop.second << "\n"); 978 VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop); 979 for (std::multimap<ValuePair, ValuePair>::iterator C = QPairRange.first; 980 C != QPairRange.second; ++C) { 981 if (C->second == P) { 982 DEBUG(dbgs() 983 << "BBV: rejected to prevent non-trivial cycle formation: " 984 << *C->first.first << " <-> " << *C->first.second << "\n"); 985 return true; 986 } 987 988 if (CurrentPairs.count(C->second) && !Visited.count(C->second)) 989 Q.push_back(C->second); 990 } 991 } while (!Q.empty()); 992 993 return false; 994 } 995 996 // This function builds the initial tree of connected pairs with the 997 // pair J at the root. 998 void BBVectorize::buildInitialTreeFor( 999 std::multimap<Value *, Value *> &CandidatePairs, 1000 std::vector<Value *> &PairableInsts, 1001 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1002 DenseSet<ValuePair> &PairableInstUsers, 1003 DenseMap<Value *, Value *> &ChosenPairs, 1004 DenseMap<ValuePair, size_t> &Tree, ValuePair J) { 1005 // Each of these pairs is viewed as the root node of a Tree. The Tree 1006 // is then walked (depth-first). As this happens, we keep track of 1007 // the pairs that compose the Tree and the maximum depth of the Tree. 1008 SmallVector<ValuePairWithDepth, 32> Q; 1009 // General depth-first post-order traversal: 1010 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1011 do { 1012 ValuePairWithDepth QTop = Q.back(); 1013 1014 // Push each child onto the queue: 1015 bool MoreChildren = false; 1016 size_t MaxChildDepth = QTop.second; 1017 VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first); 1018 for (std::multimap<ValuePair, ValuePair>::iterator k = qtRange.first; 1019 k != qtRange.second; ++k) { 1020 // Make sure that this child pair is still a candidate: 1021 bool IsStillCand = false; 1022 VPIteratorPair checkRange = 1023 CandidatePairs.equal_range(k->second.first); 1024 for (std::multimap<Value *, Value *>::iterator m = checkRange.first; 1025 m != checkRange.second; ++m) { 1026 if (m->second == k->second.second) { 1027 IsStillCand = true; 1028 break; 1029 } 1030 } 1031 1032 if (IsStillCand) { 1033 DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second); 1034 if (C == Tree.end()) { 1035 size_t d = getDepthFactor(k->second.first); 1036 Q.push_back(ValuePairWithDepth(k->second, QTop.second+d)); 1037 MoreChildren = true; 1038 } else { 1039 MaxChildDepth = std::max(MaxChildDepth, C->second); 1040 } 1041 } 1042 } 1043 1044 if (!MoreChildren) { 1045 // Record the current pair as part of the Tree: 1046 Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); 1047 Q.pop_back(); 1048 } 1049 } while (!Q.empty()); 1050 } 1051 1052 // Given some initial tree, prune it by removing conflicting pairs (pairs 1053 // that cannot be simultaneously chosen for vectorization). 1054 void BBVectorize::pruneTreeFor( 1055 std::multimap<Value *, Value *> &CandidatePairs, 1056 std::vector<Value *> &PairableInsts, 1057 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1058 DenseSet<ValuePair> &PairableInstUsers, 1059 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 1060 DenseMap<Value *, Value *> &ChosenPairs, 1061 DenseMap<ValuePair, size_t> &Tree, 1062 DenseSet<ValuePair> &PrunedTree, ValuePair J, 1063 bool UseCycleCheck) { 1064 SmallVector<ValuePairWithDepth, 32> Q; 1065 // General depth-first post-order traversal: 1066 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1067 do { 1068 ValuePairWithDepth QTop = Q.pop_back_val(); 1069 PrunedTree.insert(QTop.first); 1070 1071 // Visit each child, pruning as necessary... 1072 DenseMap<ValuePair, size_t> BestChildren; 1073 VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first); 1074 for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first; 1075 K != QTopRange.second; ++K) { 1076 DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second); 1077 if (C == Tree.end()) continue; 1078 1079 // This child is in the Tree, now we need to make sure it is the 1080 // best of any conflicting children. There could be multiple 1081 // conflicting children, so first, determine if we're keeping 1082 // this child, then delete conflicting children as necessary. 1083 1084 // It is also necessary to guard against pairing-induced 1085 // dependencies. Consider instructions a .. x .. y .. b 1086 // such that (a,b) are to be fused and (x,y) are to be fused 1087 // but a is an input to x and b is an output from y. This 1088 // means that y cannot be moved after b but x must be moved 1089 // after b for (a,b) to be fused. In other words, after 1090 // fusing (a,b) we have y .. a/b .. x where y is an input 1091 // to a/b and x is an output to a/b: x and y can no longer 1092 // be legally fused. To prevent this condition, we must 1093 // make sure that a child pair added to the Tree is not 1094 // both an input and output of an already-selected pair. 1095 1096 // Pairing-induced dependencies can also form from more complicated 1097 // cycles. The pair vs. pair conflicts are easy to check, and so 1098 // that is done explicitly for "fast rejection", and because for 1099 // child vs. child conflicts, we may prefer to keep the current 1100 // pair in preference to the already-selected child. 1101 DenseSet<ValuePair> CurrentPairs; 1102 1103 bool CanAdd = true; 1104 for (DenseMap<ValuePair, size_t>::iterator C2 1105 = BestChildren.begin(), E2 = BestChildren.end(); 1106 C2 != E2; ++C2) { 1107 if (C2->first.first == C->first.first || 1108 C2->first.first == C->first.second || 1109 C2->first.second == C->first.first || 1110 C2->first.second == C->first.second || 1111 pairsConflict(C2->first, C->first, PairableInstUsers, 1112 UseCycleCheck ? &PairableInstUserMap : 0)) { 1113 if (C2->second >= C->second) { 1114 CanAdd = false; 1115 break; 1116 } 1117 1118 CurrentPairs.insert(C2->first); 1119 } 1120 } 1121 if (!CanAdd) continue; 1122 1123 // Even worse, this child could conflict with another node already 1124 // selected for the Tree. If that is the case, ignore this child. 1125 for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(), 1126 E2 = PrunedTree.end(); T != E2; ++T) { 1127 if (T->first == C->first.first || 1128 T->first == C->first.second || 1129 T->second == C->first.first || 1130 T->second == C->first.second || 1131 pairsConflict(*T, C->first, PairableInstUsers, 1132 UseCycleCheck ? &PairableInstUserMap : 0)) { 1133 CanAdd = false; 1134 break; 1135 } 1136 1137 CurrentPairs.insert(*T); 1138 } 1139 if (!CanAdd) continue; 1140 1141 // And check the queue too... 1142 for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(), 1143 E2 = Q.end(); C2 != E2; ++C2) { 1144 if (C2->first.first == C->first.first || 1145 C2->first.first == C->first.second || 1146 C2->first.second == C->first.first || 1147 C2->first.second == C->first.second || 1148 pairsConflict(C2->first, C->first, PairableInstUsers, 1149 UseCycleCheck ? &PairableInstUserMap : 0)) { 1150 CanAdd = false; 1151 break; 1152 } 1153 1154 CurrentPairs.insert(C2->first); 1155 } 1156 if (!CanAdd) continue; 1157 1158 // Last but not least, check for a conflict with any of the 1159 // already-chosen pairs. 1160 for (DenseMap<Value *, Value *>::iterator C2 = 1161 ChosenPairs.begin(), E2 = ChosenPairs.end(); 1162 C2 != E2; ++C2) { 1163 if (pairsConflict(*C2, C->first, PairableInstUsers, 1164 UseCycleCheck ? &PairableInstUserMap : 0)) { 1165 CanAdd = false; 1166 break; 1167 } 1168 1169 CurrentPairs.insert(*C2); 1170 } 1171 if (!CanAdd) continue; 1172 1173 // To check for non-trivial cycles formed by the addition of the 1174 // current pair we've formed a list of all relevant pairs, now use a 1175 // graph walk to check for a cycle. We start from the current pair and 1176 // walk the use tree to see if we again reach the current pair. If we 1177 // do, then the current pair is rejected. 1178 1179 // FIXME: It may be more efficient to use a topological-ordering 1180 // algorithm to improve the cycle check. This should be investigated. 1181 if (UseCycleCheck && 1182 pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) 1183 continue; 1184 1185 // This child can be added, but we may have chosen it in preference 1186 // to an already-selected child. Check for this here, and if a 1187 // conflict is found, then remove the previously-selected child 1188 // before adding this one in its place. 1189 for (DenseMap<ValuePair, size_t>::iterator C2 1190 = BestChildren.begin(); C2 != BestChildren.end();) { 1191 if (C2->first.first == C->first.first || 1192 C2->first.first == C->first.second || 1193 C2->first.second == C->first.first || 1194 C2->first.second == C->first.second || 1195 pairsConflict(C2->first, C->first, PairableInstUsers)) 1196 BestChildren.erase(C2++); 1197 else 1198 ++C2; 1199 } 1200 1201 BestChildren.insert(ValuePairWithDepth(C->first, C->second)); 1202 } 1203 1204 for (DenseMap<ValuePair, size_t>::iterator C 1205 = BestChildren.begin(), E2 = BestChildren.end(); 1206 C != E2; ++C) { 1207 size_t DepthF = getDepthFactor(C->first.first); 1208 Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); 1209 } 1210 } while (!Q.empty()); 1211 } 1212 1213 // This function finds the best tree of mututally-compatible connected 1214 // pairs, given the choice of root pairs as an iterator range. 1215 void BBVectorize::findBestTreeFor( 1216 std::multimap<Value *, Value *> &CandidatePairs, 1217 std::vector<Value *> &PairableInsts, 1218 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1219 DenseSet<ValuePair> &PairableInstUsers, 1220 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 1221 DenseMap<Value *, Value *> &ChosenPairs, 1222 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, 1223 size_t &BestEffSize, VPIteratorPair ChoiceRange, 1224 bool UseCycleCheck) { 1225 for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first; 1226 J != ChoiceRange.second; ++J) { 1227 1228 // Before going any further, make sure that this pair does not 1229 // conflict with any already-selected pairs (see comment below 1230 // near the Tree pruning for more details). 1231 DenseSet<ValuePair> ChosenPairSet; 1232 bool DoesConflict = false; 1233 for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), 1234 E = ChosenPairs.end(); C != E; ++C) { 1235 if (pairsConflict(*C, *J, PairableInstUsers, 1236 UseCycleCheck ? &PairableInstUserMap : 0)) { 1237 DoesConflict = true; 1238 break; 1239 } 1240 1241 ChosenPairSet.insert(*C); 1242 } 1243 if (DoesConflict) continue; 1244 1245 if (UseCycleCheck && 1246 pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet)) 1247 continue; 1248 1249 DenseMap<ValuePair, size_t> Tree; 1250 buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1251 PairableInstUsers, ChosenPairs, Tree, *J); 1252 1253 // Because we'll keep the child with the largest depth, the largest 1254 // depth is still the same in the unpruned Tree. 1255 size_t MaxDepth = Tree.lookup(*J); 1256 1257 DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {" 1258 << *J->first << " <-> " << *J->second << "} of depth " << 1259 MaxDepth << " and size " << Tree.size() << "\n"); 1260 1261 // At this point the Tree has been constructed, but, may contain 1262 // contradictory children (meaning that different children of 1263 // some tree node may be attempting to fuse the same instruction). 1264 // So now we walk the tree again, in the case of a conflict, 1265 // keep only the child with the largest depth. To break a tie, 1266 // favor the first child. 1267 1268 DenseSet<ValuePair> PrunedTree; 1269 pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1270 PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree, 1271 PrunedTree, *J, UseCycleCheck); 1272 1273 size_t EffSize = 0; 1274 for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), 1275 E = PrunedTree.end(); S != E; ++S) 1276 EffSize += getDepthFactor(S->first); 1277 1278 DEBUG(if (DebugPairSelection) 1279 dbgs() << "BBV: found pruned Tree for pair {" 1280 << *J->first << " <-> " << *J->second << "} of depth " << 1281 MaxDepth << " and size " << PrunedTree.size() << 1282 " (effective size: " << EffSize << ")\n"); 1283 if (MaxDepth >= ReqChainDepth && EffSize > BestEffSize) { 1284 BestMaxDepth = MaxDepth; 1285 BestEffSize = EffSize; 1286 BestTree = PrunedTree; 1287 } 1288 } 1289 } 1290 1291 // Given the list of candidate pairs, this function selects those 1292 // that will be fused into vector instructions. 1293 void BBVectorize::choosePairs( 1294 std::multimap<Value *, Value *> &CandidatePairs, 1295 std::vector<Value *> &PairableInsts, 1296 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1297 DenseSet<ValuePair> &PairableInstUsers, 1298 DenseMap<Value *, Value *>& ChosenPairs) { 1299 bool UseCycleCheck = CandidatePairs.size() <= MaxCandPairsForCycleCheck; 1300 std::multimap<ValuePair, ValuePair> PairableInstUserMap; 1301 for (std::vector<Value *>::iterator I = PairableInsts.begin(), 1302 E = PairableInsts.end(); I != E; ++I) { 1303 // The number of possible pairings for this variable: 1304 size_t NumChoices = CandidatePairs.count(*I); 1305 if (!NumChoices) continue; 1306 1307 VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I); 1308 1309 // The best pair to choose and its tree: 1310 size_t BestMaxDepth = 0, BestEffSize = 0; 1311 DenseSet<ValuePair> BestTree; 1312 findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1313 PairableInstUsers, PairableInstUserMap, ChosenPairs, 1314 BestTree, BestMaxDepth, BestEffSize, ChoiceRange, 1315 UseCycleCheck); 1316 1317 // A tree has been chosen (or not) at this point. If no tree was 1318 // chosen, then this instruction, I, cannot be paired (and is no longer 1319 // considered). 1320 1321 DEBUG(if (BestTree.size() > 0) 1322 dbgs() << "BBV: selected pairs in the best tree for: " 1323 << *cast<Instruction>(*I) << "\n"); 1324 1325 for (DenseSet<ValuePair>::iterator S = BestTree.begin(), 1326 SE2 = BestTree.end(); S != SE2; ++S) { 1327 // Insert the members of this tree into the list of chosen pairs. 1328 ChosenPairs.insert(ValuePair(S->first, S->second)); 1329 DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << 1330 *S->second << "\n"); 1331 1332 // Remove all candidate pairs that have values in the chosen tree. 1333 for (std::multimap<Value *, Value *>::iterator K = 1334 CandidatePairs.begin(); K != CandidatePairs.end();) { 1335 if (K->first == S->first || K->second == S->first || 1336 K->second == S->second || K->first == S->second) { 1337 // Don't remove the actual pair chosen so that it can be used 1338 // in subsequent tree selections. 1339 if (!(K->first == S->first && K->second == S->second)) 1340 CandidatePairs.erase(K++); 1341 else 1342 ++K; 1343 } else { 1344 ++K; 1345 } 1346 } 1347 } 1348 } 1349 1350 DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); 1351 } 1352 1353 std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, 1354 unsigned n = 0) { 1355 if (!I->hasName()) 1356 return ""; 1357 1358 return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + 1359 (n > 0 ? "." + utostr(n) : "")).str(); 1360 } 1361 1362 // Returns the value that is to be used as the pointer input to the vector 1363 // instruction that fuses I with J. 1364 Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, 1365 Instruction *I, Instruction *J, unsigned o, 1366 bool &FlipMemInputs) { 1367 Value *IPtr, *JPtr; 1368 unsigned IAlignment, JAlignment; 1369 int64_t OffsetInElmts; 1370 (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 1371 OffsetInElmts); 1372 1373 // The pointer value is taken to be the one with the lowest offset. 1374 Value *VPtr; 1375 if (OffsetInElmts > 0) { 1376 VPtr = IPtr; 1377 } else { 1378 FlipMemInputs = true; 1379 VPtr = JPtr; 1380 } 1381 1382 Type *ArgType = cast<PointerType>(IPtr->getType())->getElementType(); 1383 Type *VArgType = getVecTypeForPair(ArgType); 1384 Type *VArgPtrType = PointerType::get(VArgType, 1385 cast<PointerType>(IPtr->getType())->getAddressSpace()); 1386 return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), 1387 /* insert before */ FlipMemInputs ? J : I); 1388 } 1389 1390 void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, 1391 unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, 1392 unsigned IdxOffset, std::vector<Constant*> &Mask) { 1393 for (unsigned v = 0; v < NumElem/2; ++v) { 1394 int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); 1395 if (m < 0) { 1396 Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); 1397 } else { 1398 unsigned mm = m + (int) IdxOffset; 1399 if (m >= (int) NumInElem) 1400 mm += (int) NumInElem; 1401 1402 Mask[v+MaskOffset] = 1403 ConstantInt::get(Type::getInt32Ty(Context), mm); 1404 } 1405 } 1406 } 1407 1408 // Returns the value that is to be used as the vector-shuffle mask to the 1409 // vector instruction that fuses I with J. 1410 Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, 1411 Instruction *I, Instruction *J) { 1412 // This is the shuffle mask. We need to append the second 1413 // mask to the first, and the numbers need to be adjusted. 1414 1415 Type *ArgType = I->getType(); 1416 Type *VArgType = getVecTypeForPair(ArgType); 1417 1418 // Get the total number of elements in the fused vector type. 1419 // By definition, this must equal the number of elements in 1420 // the final mask. 1421 unsigned NumElem = cast<VectorType>(VArgType)->getNumElements(); 1422 std::vector<Constant*> Mask(NumElem); 1423 1424 Type *OpType = I->getOperand(0)->getType(); 1425 unsigned NumInElem = cast<VectorType>(OpType)->getNumElements(); 1426 1427 // For the mask from the first pair... 1428 fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask); 1429 1430 // For the mask from the second pair... 1431 fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem, 1432 Mask); 1433 1434 return ConstantVector::get(Mask); 1435 } 1436 1437 // Returns the value to be used as the specified operand of the vector 1438 // instruction that fuses I with J. 1439 Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, 1440 Instruction *J, unsigned o, bool FlipMemInputs) { 1441 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 1442 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); 1443 1444 // Compute the fused vector type for this operand 1445 Type *ArgType = I->getOperand(o)->getType(); 1446 VectorType *VArgType = getVecTypeForPair(ArgType); 1447 1448 Instruction *L = I, *H = J; 1449 if (FlipMemInputs) { 1450 L = J; 1451 H = I; 1452 } 1453 1454 if (ArgType->isVectorTy()) { 1455 unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); 1456 std::vector<Constant*> Mask(numElem); 1457 for (unsigned v = 0; v < numElem; ++v) 1458 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 1459 1460 Instruction *BV = new ShuffleVectorInst(L->getOperand(o), 1461 H->getOperand(o), 1462 ConstantVector::get(Mask), 1463 getReplacementName(I, true, o)); 1464 BV->insertBefore(J); 1465 return BV; 1466 } 1467 1468 // If these two inputs are the output of another vector instruction, 1469 // then we should use that output directly. It might be necessary to 1470 // permute it first. [When pairings are fused recursively, you can 1471 // end up with cases where a large vector is decomposed into scalars 1472 // using extractelement instructions, then built into size-2 1473 // vectors using insertelement and the into larger vectors using 1474 // shuffles. InstCombine does not simplify all of these cases well, 1475 // and so we make sure that shuffles are generated here when possible. 1476 ExtractElementInst *LEE 1477 = dyn_cast<ExtractElementInst>(L->getOperand(o)); 1478 ExtractElementInst *HEE 1479 = dyn_cast<ExtractElementInst>(H->getOperand(o)); 1480 1481 if (LEE && HEE && 1482 LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) { 1483 VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType()); 1484 unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue(); 1485 unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue(); 1486 if (LEE->getOperand(0) == HEE->getOperand(0)) { 1487 if (LowIndx == 0 && HighIndx == 1) 1488 return LEE->getOperand(0); 1489 1490 std::vector<Constant*> Mask(2); 1491 Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); 1492 Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); 1493 1494 Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), 1495 UndefValue::get(EEType), 1496 ConstantVector::get(Mask), 1497 getReplacementName(I, true, o)); 1498 BV->insertBefore(J); 1499 return BV; 1500 } 1501 1502 std::vector<Constant*> Mask(2); 1503 HighIndx += EEType->getNumElements(); 1504 Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); 1505 Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); 1506 1507 Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), 1508 HEE->getOperand(0), 1509 ConstantVector::get(Mask), 1510 getReplacementName(I, true, o)); 1511 BV->insertBefore(J); 1512 return BV; 1513 } 1514 1515 Instruction *BV1 = InsertElementInst::Create( 1516 UndefValue::get(VArgType), 1517 L->getOperand(o), CV0, 1518 getReplacementName(I, true, o, 1)); 1519 BV1->insertBefore(I); 1520 Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o), 1521 CV1, 1522 getReplacementName(I, true, o, 2)); 1523 BV2->insertBefore(J); 1524 return BV2; 1525 } 1526 1527 // This function creates an array of values that will be used as the inputs 1528 // to the vector instruction that fuses I with J. 1529 void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, 1530 Instruction *I, Instruction *J, 1531 SmallVector<Value *, 3> &ReplacedOperands, 1532 bool &FlipMemInputs) { 1533 FlipMemInputs = false; 1534 unsigned NumOperands = I->getNumOperands(); 1535 1536 for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { 1537 // Iterate backward so that we look at the store pointer 1538 // first and know whether or not we need to flip the inputs. 1539 1540 if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { 1541 // This is the pointer for a load/store instruction. 1542 ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o, 1543 FlipMemInputs); 1544 continue; 1545 } else if (isa<CallInst>(I)) { 1546 Function *F = cast<CallInst>(I)->getCalledFunction(); 1547 unsigned IID = F->getIntrinsicID(); 1548 if (o == NumOperands-1) { 1549 BasicBlock &BB = *I->getParent(); 1550 1551 Module *M = BB.getParent()->getParent(); 1552 Type *ArgType = I->getType(); 1553 Type *VArgType = getVecTypeForPair(ArgType); 1554 1555 // FIXME: is it safe to do this here? 1556 ReplacedOperands[o] = Intrinsic::getDeclaration(M, 1557 (Intrinsic::ID) IID, VArgType); 1558 continue; 1559 } else if (IID == Intrinsic::powi && o == 1) { 1560 // The second argument of powi is a single integer and we've already 1561 // checked that both arguments are equal. As a result, we just keep 1562 // I's second argument. 1563 ReplacedOperands[o] = I->getOperand(o); 1564 continue; 1565 } 1566 } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) { 1567 ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); 1568 continue; 1569 } 1570 1571 ReplacedOperands[o] = 1572 getReplacementInput(Context, I, J, o, FlipMemInputs); 1573 } 1574 } 1575 1576 // This function creates two values that represent the outputs of the 1577 // original I and J instructions. These are generally vector shuffles 1578 // or extracts. In many cases, these will end up being unused and, thus, 1579 // eliminated by later passes. 1580 void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 1581 Instruction *J, Instruction *K, 1582 Instruction *&InsertionPt, 1583 Instruction *&K1, Instruction *&K2, 1584 bool &FlipMemInputs) { 1585 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 1586 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); 1587 1588 if (isa<StoreInst>(I)) { 1589 AA->replaceWithNewValue(I, K); 1590 AA->replaceWithNewValue(J, K); 1591 } else { 1592 Type *IType = I->getType(); 1593 Type *VType = getVecTypeForPair(IType); 1594 1595 if (IType->isVectorTy()) { 1596 unsigned numElem = cast<VectorType>(IType)->getNumElements(); 1597 std::vector<Constant*> Mask1(numElem), Mask2(numElem); 1598 for (unsigned v = 0; v < numElem; ++v) { 1599 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 1600 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v); 1601 } 1602 1603 K1 = new ShuffleVectorInst(K, UndefValue::get(VType), 1604 ConstantVector::get( 1605 FlipMemInputs ? Mask2 : Mask1), 1606 getReplacementName(K, false, 1)); 1607 K2 = new ShuffleVectorInst(K, UndefValue::get(VType), 1608 ConstantVector::get( 1609 FlipMemInputs ? Mask1 : Mask2), 1610 getReplacementName(K, false, 2)); 1611 } else { 1612 K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0, 1613 getReplacementName(K, false, 1)); 1614 K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1, 1615 getReplacementName(K, false, 2)); 1616 } 1617 1618 K1->insertAfter(K); 1619 K2->insertAfter(K1); 1620 InsertionPt = K2; 1621 } 1622 } 1623 1624 // Move all uses of the function I (including pairing-induced uses) after J. 1625 bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, 1626 std::multimap<Value *, Value *> &LoadMoveSet, 1627 Instruction *I, Instruction *J) { 1628 // Skip to the first instruction past I. 1629 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 1630 1631 DenseSet<Value *> Users; 1632 AliasSetTracker WriteSet(*AA); 1633 for (; cast<Instruction>(L) != J; ++L) 1634 (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet); 1635 1636 assert(cast<Instruction>(L) == J && 1637 "Tracking has not proceeded far enough to check for dependencies"); 1638 // If J is now in the use set of I, then trackUsesOfI will return true 1639 // and we have a dependency cycle (and the fusing operation must abort). 1640 return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSet); 1641 } 1642 1643 // Move all uses of the function I (including pairing-induced uses) after J. 1644 void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, 1645 std::multimap<Value *, Value *> &LoadMoveSet, 1646 Instruction *&InsertionPt, 1647 Instruction *I, Instruction *J) { 1648 // Skip to the first instruction past I. 1649 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 1650 1651 DenseSet<Value *> Users; 1652 AliasSetTracker WriteSet(*AA); 1653 for (; cast<Instruction>(L) != J;) { 1654 if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) { 1655 // Move this instruction 1656 Instruction *InstToMove = L; ++L; 1657 1658 DEBUG(dbgs() << "BBV: moving: " << *InstToMove << 1659 " to after " << *InsertionPt << "\n"); 1660 InstToMove->removeFromParent(); 1661 InstToMove->insertAfter(InsertionPt); 1662 InsertionPt = InstToMove; 1663 } else { 1664 ++L; 1665 } 1666 } 1667 } 1668 1669 // Collect all load instruction that are in the move set of a given first 1670 // pair member. These loads depend on the first instruction, I, and so need 1671 // to be moved after J (the second instruction) when the pair is fused. 1672 void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, 1673 DenseMap<Value *, Value *> &ChosenPairs, 1674 std::multimap<Value *, Value *> &LoadMoveSet, 1675 Instruction *I) { 1676 // Skip to the first instruction past I. 1677 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 1678 1679 DenseSet<Value *> Users; 1680 AliasSetTracker WriteSet(*AA); 1681 1682 // Note: We cannot end the loop when we reach J because J could be moved 1683 // farther down the use chain by another instruction pairing. Also, J 1684 // could be before I if this is an inverted input. 1685 for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) { 1686 if (trackUsesOfI(Users, WriteSet, I, L)) { 1687 if (L->mayReadFromMemory()) 1688 LoadMoveSet.insert(ValuePair(L, I)); 1689 } 1690 } 1691 } 1692 1693 // In cases where both load/stores and the computation of their pointers 1694 // are chosen for vectorization, we can end up in a situation where the 1695 // aliasing analysis starts returning different query results as the 1696 // process of fusing instruction pairs continues. Because the algorithm 1697 // relies on finding the same use trees here as were found earlier, we'll 1698 // need to precompute the necessary aliasing information here and then 1699 // manually update it during the fusion process. 1700 void BBVectorize::collectLoadMoveSet(BasicBlock &BB, 1701 std::vector<Value *> &PairableInsts, 1702 DenseMap<Value *, Value *> &ChosenPairs, 1703 std::multimap<Value *, Value *> &LoadMoveSet) { 1704 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 1705 PIE = PairableInsts.end(); PI != PIE; ++PI) { 1706 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); 1707 if (P == ChosenPairs.end()) continue; 1708 1709 Instruction *I = cast<Instruction>(P->first); 1710 collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I); 1711 } 1712 } 1713 1714 // This function fuses the chosen instruction pairs into vector instructions, 1715 // taking care preserve any needed scalar outputs and, then, it reorders the 1716 // remaining instructions as needed (users of the first member of the pair 1717 // need to be moved to after the location of the second member of the pair 1718 // because the vector instruction is inserted in the location of the pair's 1719 // second member). 1720 void BBVectorize::fuseChosenPairs(BasicBlock &BB, 1721 std::vector<Value *> &PairableInsts, 1722 DenseMap<Value *, Value *> &ChosenPairs) { 1723 LLVMContext& Context = BB.getContext(); 1724 1725 // During the vectorization process, the order of the pairs to be fused 1726 // could be flipped. So we'll add each pair, flipped, into the ChosenPairs 1727 // list. After a pair is fused, the flipped pair is removed from the list. 1728 std::vector<ValuePair> FlippedPairs; 1729 FlippedPairs.reserve(ChosenPairs.size()); 1730 for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), 1731 E = ChosenPairs.end(); P != E; ++P) 1732 FlippedPairs.push_back(ValuePair(P->second, P->first)); 1733 for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(), 1734 E = FlippedPairs.end(); P != E; ++P) 1735 ChosenPairs.insert(*P); 1736 1737 std::multimap<Value *, Value *> LoadMoveSet; 1738 collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet); 1739 1740 DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); 1741 1742 for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { 1743 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI); 1744 if (P == ChosenPairs.end()) { 1745 ++PI; 1746 continue; 1747 } 1748 1749 if (getDepthFactor(P->first) == 0) { 1750 // These instructions are not really fused, but are tracked as though 1751 // they are. Any case in which it would be interesting to fuse them 1752 // will be taken care of by InstCombine. 1753 --NumFusedOps; 1754 ++PI; 1755 continue; 1756 } 1757 1758 Instruction *I = cast<Instruction>(P->first), 1759 *J = cast<Instruction>(P->second); 1760 1761 DEBUG(dbgs() << "BBV: fusing: " << *I << 1762 " <-> " << *J << "\n"); 1763 1764 // Remove the pair and flipped pair from the list. 1765 DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second); 1766 assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); 1767 ChosenPairs.erase(FP); 1768 ChosenPairs.erase(P); 1769 1770 if (!canMoveUsesOfIAfterJ(BB, LoadMoveSet, I, J)) { 1771 DEBUG(dbgs() << "BBV: fusion of: " << *I << 1772 " <-> " << *J << 1773 " aborted because of non-trivial dependency cycle\n"); 1774 --NumFusedOps; 1775 ++PI; 1776 continue; 1777 } 1778 1779 bool FlipMemInputs; 1780 unsigned NumOperands = I->getNumOperands(); 1781 SmallVector<Value *, 3> ReplacedOperands(NumOperands); 1782 getReplacementInputsForPair(Context, I, J, ReplacedOperands, 1783 FlipMemInputs); 1784 1785 // Make a copy of the original operation, change its type to the vector 1786 // type and replace its operands with the vector operands. 1787 Instruction *K = I->clone(); 1788 if (I->hasName()) K->takeName(I); 1789 1790 if (!isa<StoreInst>(K)) 1791 K->mutateType(getVecTypeForPair(I->getType())); 1792 1793 for (unsigned o = 0; o < NumOperands; ++o) 1794 K->setOperand(o, ReplacedOperands[o]); 1795 1796 // If we've flipped the memory inputs, make sure that we take the correct 1797 // alignment. 1798 if (FlipMemInputs) { 1799 if (isa<StoreInst>(K)) 1800 cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment()); 1801 else 1802 cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment()); 1803 } 1804 1805 K->insertAfter(J); 1806 1807 // Instruction insertion point: 1808 Instruction *InsertionPt = K; 1809 Instruction *K1 = 0, *K2 = 0; 1810 replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2, 1811 FlipMemInputs); 1812 1813 // The use tree of the first original instruction must be moved to after 1814 // the location of the second instruction. The entire use tree of the 1815 // first instruction is disjoint from the input tree of the second 1816 // (by definition), and so commutes with it. 1817 1818 moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J); 1819 1820 if (!isa<StoreInst>(I)) { 1821 I->replaceAllUsesWith(K1); 1822 J->replaceAllUsesWith(K2); 1823 AA->replaceWithNewValue(I, K1); 1824 AA->replaceWithNewValue(J, K2); 1825 } 1826 1827 // Instructions that may read from memory may be in the load move set. 1828 // Once an instruction is fused, we no longer need its move set, and so 1829 // the values of the map never need to be updated. However, when a load 1830 // is fused, we need to merge the entries from both instructions in the 1831 // pair in case those instructions were in the move set of some other 1832 // yet-to-be-fused pair. The loads in question are the keys of the map. 1833 if (I->mayReadFromMemory()) { 1834 std::vector<ValuePair> NewSetMembers; 1835 VPIteratorPair IPairRange = LoadMoveSet.equal_range(I); 1836 VPIteratorPair JPairRange = LoadMoveSet.equal_range(J); 1837 for (std::multimap<Value *, Value *>::iterator N = IPairRange.first; 1838 N != IPairRange.second; ++N) 1839 NewSetMembers.push_back(ValuePair(K, N->second)); 1840 for (std::multimap<Value *, Value *>::iterator N = JPairRange.first; 1841 N != JPairRange.second; ++N) 1842 NewSetMembers.push_back(ValuePair(K, N->second)); 1843 for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), 1844 AE = NewSetMembers.end(); A != AE; ++A) 1845 LoadMoveSet.insert(*A); 1846 } 1847 1848 // Before removing I, set the iterator to the next instruction. 1849 PI = llvm::next(BasicBlock::iterator(I)); 1850 if (cast<Instruction>(PI) == J) 1851 ++PI; 1852 1853 SE->forgetValue(I); 1854 SE->forgetValue(J); 1855 I->eraseFromParent(); 1856 J->eraseFromParent(); 1857 } 1858 1859 DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); 1860 } 1861} 1862 1863char BBVectorize::ID = 0; 1864static const char bb_vectorize_name[] = "Basic-Block Vectorization"; 1865INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 1866INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 1867INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 1868INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 1869 1870BasicBlockPass *llvm::createBBVectorizePass() { 1871 return new BBVectorize(); 1872} 1873 1874bool llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB) { 1875 BBVectorize BBVectorizer(P); 1876 return BBVectorizer.vectorizeBB(BB); 1877} 1878