BBVectorize.cpp revision ec4e85e3364f50802f2007e4b1e23661d4610366
1//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements a basic-block vectorization pass. The algorithm was 11// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, 12// et al. It works by looking for chains of pairable operations and then 13// pairing them. 14// 15//===----------------------------------------------------------------------===// 16 17#define BBV_NAME "bb-vectorize" 18#define DEBUG_TYPE BBV_NAME 19#include "llvm/Constants.h" 20#include "llvm/DerivedTypes.h" 21#include "llvm/Function.h" 22#include "llvm/Instructions.h" 23#include "llvm/IntrinsicInst.h" 24#include "llvm/Intrinsics.h" 25#include "llvm/LLVMContext.h" 26#include "llvm/Metadata.h" 27#include "llvm/Pass.h" 28#include "llvm/Type.h" 29#include "llvm/ADT/DenseMap.h" 30#include "llvm/ADT/DenseSet.h" 31#include "llvm/ADT/SmallVector.h" 32#include "llvm/ADT/Statistic.h" 33#include "llvm/ADT/STLExtras.h" 34#include "llvm/ADT/StringExtras.h" 35#include "llvm/Analysis/AliasAnalysis.h" 36#include "llvm/Analysis/AliasSetTracker.h" 37#include "llvm/Analysis/ScalarEvolution.h" 38#include "llvm/Analysis/ScalarEvolutionExpressions.h" 39#include "llvm/Analysis/ValueTracking.h" 40#include "llvm/Support/CommandLine.h" 41#include "llvm/Support/Debug.h" 42#include "llvm/Support/raw_ostream.h" 43#include "llvm/Support/ValueHandle.h" 44#include "llvm/Target/TargetData.h" 45#include "llvm/Transforms/Vectorize.h" 46#include <algorithm> 47#include <map> 48using namespace llvm; 49 50static cl::opt<unsigned> 51ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, 52 cl::desc("The required chain depth for vectorization")); 53 54static cl::opt<unsigned> 55SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, 56 cl::desc("The maximum search distance for instruction pairs")); 57 58static cl::opt<bool> 59SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, 60 cl::desc("Replicating one element to a pair breaks the chain")); 61 62static cl::opt<unsigned> 63VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, 64 cl::desc("The size of the native vector registers")); 65 66static cl::opt<unsigned> 67MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, 68 cl::desc("The maximum number of pairing iterations")); 69 70static cl::opt<unsigned> 71MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, 72 cl::desc("The maximum number of pairable instructions per group")); 73 74static cl::opt<unsigned> 75MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), 76 cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" 77 " a full cycle check")); 78 79static cl::opt<bool> 80NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, 81 cl::desc("Don't try to vectorize boolean (i1) values")); 82 83static cl::opt<bool> 84NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, 85 cl::desc("Don't try to vectorize integer values")); 86 87static cl::opt<bool> 88NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, 89 cl::desc("Don't try to vectorize floating-point values")); 90 91static cl::opt<bool> 92NoPointers("bb-vectorize-no-pointers", cl::init(false), cl::Hidden, 93 cl::desc("Don't try to vectorize pointer values")); 94 95static cl::opt<bool> 96NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, 97 cl::desc("Don't try to vectorize casting (conversion) operations")); 98 99static cl::opt<bool> 100NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, 101 cl::desc("Don't try to vectorize floating-point math intrinsics")); 102 103static cl::opt<bool> 104NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, 105 cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); 106 107static cl::opt<bool> 108NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, 109 cl::desc("Don't try to vectorize select instructions")); 110 111static cl::opt<bool> 112NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, 113 cl::desc("Don't try to vectorize comparison instructions")); 114 115static cl::opt<bool> 116NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, 117 cl::desc("Don't try to vectorize getelementptr instructions")); 118 119static cl::opt<bool> 120NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, 121 cl::desc("Don't try to vectorize loads and stores")); 122 123static cl::opt<bool> 124AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, 125 cl::desc("Only generate aligned loads and stores")); 126 127static cl::opt<bool> 128NoMemOpBoost("bb-vectorize-no-mem-op-boost", 129 cl::init(false), cl::Hidden, 130 cl::desc("Don't boost the chain-depth contribution of loads and stores")); 131 132static cl::opt<bool> 133FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, 134 cl::desc("Use a fast instruction dependency analysis")); 135 136#ifndef NDEBUG 137static cl::opt<bool> 138DebugInstructionExamination("bb-vectorize-debug-instruction-examination", 139 cl::init(false), cl::Hidden, 140 cl::desc("When debugging is enabled, output information on the" 141 " instruction-examination process")); 142static cl::opt<bool> 143DebugCandidateSelection("bb-vectorize-debug-candidate-selection", 144 cl::init(false), cl::Hidden, 145 cl::desc("When debugging is enabled, output information on the" 146 " candidate-selection process")); 147static cl::opt<bool> 148DebugPairSelection("bb-vectorize-debug-pair-selection", 149 cl::init(false), cl::Hidden, 150 cl::desc("When debugging is enabled, output information on the" 151 " pair-selection process")); 152static cl::opt<bool> 153DebugCycleCheck("bb-vectorize-debug-cycle-check", 154 cl::init(false), cl::Hidden, 155 cl::desc("When debugging is enabled, output information on the" 156 " cycle-checking process")); 157#endif 158 159STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); 160 161namespace { 162 struct BBVectorize : public BasicBlockPass { 163 static char ID; // Pass identification, replacement for typeid 164 165 const VectorizeConfig Config; 166 167 BBVectorize(const VectorizeConfig &C = VectorizeConfig()) 168 : BasicBlockPass(ID), Config(C) { 169 initializeBBVectorizePass(*PassRegistry::getPassRegistry()); 170 } 171 172 BBVectorize(Pass *P, const VectorizeConfig &C) 173 : BasicBlockPass(ID), Config(C) { 174 AA = &P->getAnalysis<AliasAnalysis>(); 175 SE = &P->getAnalysis<ScalarEvolution>(); 176 TD = P->getAnalysisIfAvailable<TargetData>(); 177 } 178 179 typedef std::pair<Value *, Value *> ValuePair; 180 typedef std::pair<ValuePair, size_t> ValuePairWithDepth; 181 typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair 182 typedef std::pair<std::multimap<Value *, Value *>::iterator, 183 std::multimap<Value *, Value *>::iterator> VPIteratorPair; 184 typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator, 185 std::multimap<ValuePair, ValuePair>::iterator> 186 VPPIteratorPair; 187 188 AliasAnalysis *AA; 189 ScalarEvolution *SE; 190 TargetData *TD; 191 192 // FIXME: const correct? 193 194 bool vectorizePairs(BasicBlock &BB); 195 196 bool getCandidatePairs(BasicBlock &BB, 197 BasicBlock::iterator &Start, 198 std::multimap<Value *, Value *> &CandidatePairs, 199 std::vector<Value *> &PairableInsts); 200 201 void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, 202 std::vector<Value *> &PairableInsts, 203 std::multimap<ValuePair, ValuePair> &ConnectedPairs); 204 205 void buildDepMap(BasicBlock &BB, 206 std::multimap<Value *, Value *> &CandidatePairs, 207 std::vector<Value *> &PairableInsts, 208 DenseSet<ValuePair> &PairableInstUsers); 209 210 void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, 211 std::vector<Value *> &PairableInsts, 212 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 213 DenseSet<ValuePair> &PairableInstUsers, 214 DenseMap<Value *, Value *>& ChosenPairs); 215 216 void fuseChosenPairs(BasicBlock &BB, 217 std::vector<Value *> &PairableInsts, 218 DenseMap<Value *, Value *>& ChosenPairs); 219 220 bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); 221 222 bool areInstsCompatible(Instruction *I, Instruction *J, 223 bool IsSimpleLoadStore); 224 225 bool trackUsesOfI(DenseSet<Value *> &Users, 226 AliasSetTracker &WriteSet, Instruction *I, 227 Instruction *J, bool UpdateUsers = true, 228 std::multimap<Value *, Value *> *LoadMoveSet = 0); 229 230 void computePairsConnectedTo( 231 std::multimap<Value *, Value *> &CandidatePairs, 232 std::vector<Value *> &PairableInsts, 233 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 234 ValuePair P); 235 236 bool pairsConflict(ValuePair P, ValuePair Q, 237 DenseSet<ValuePair> &PairableInstUsers, 238 std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0); 239 240 bool pairWillFormCycle(ValuePair P, 241 std::multimap<ValuePair, ValuePair> &PairableInstUsers, 242 DenseSet<ValuePair> &CurrentPairs); 243 244 void pruneTreeFor( 245 std::multimap<Value *, Value *> &CandidatePairs, 246 std::vector<Value *> &PairableInsts, 247 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 248 DenseSet<ValuePair> &PairableInstUsers, 249 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 250 DenseMap<Value *, Value *> &ChosenPairs, 251 DenseMap<ValuePair, size_t> &Tree, 252 DenseSet<ValuePair> &PrunedTree, ValuePair J, 253 bool UseCycleCheck); 254 255 void buildInitialTreeFor( 256 std::multimap<Value *, Value *> &CandidatePairs, 257 std::vector<Value *> &PairableInsts, 258 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 259 DenseSet<ValuePair> &PairableInstUsers, 260 DenseMap<Value *, Value *> &ChosenPairs, 261 DenseMap<ValuePair, size_t> &Tree, ValuePair J); 262 263 void findBestTreeFor( 264 std::multimap<Value *, Value *> &CandidatePairs, 265 std::vector<Value *> &PairableInsts, 266 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 267 DenseSet<ValuePair> &PairableInstUsers, 268 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 269 DenseMap<Value *, Value *> &ChosenPairs, 270 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, 271 size_t &BestEffSize, VPIteratorPair ChoiceRange, 272 bool UseCycleCheck); 273 274 Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, 275 Instruction *J, unsigned o, bool &FlipMemInputs); 276 277 void fillNewShuffleMask(LLVMContext& Context, Instruction *J, 278 unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, 279 unsigned IdxOffset, std::vector<Constant*> &Mask); 280 281 Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, 282 Instruction *J); 283 284 Value *getReplacementInput(LLVMContext& Context, Instruction *I, 285 Instruction *J, unsigned o, bool FlipMemInputs); 286 287 void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, 288 Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, 289 bool &FlipMemInputs); 290 291 void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 292 Instruction *J, Instruction *K, 293 Instruction *&InsertionPt, Instruction *&K1, 294 Instruction *&K2, bool &FlipMemInputs); 295 296 void collectPairLoadMoveSet(BasicBlock &BB, 297 DenseMap<Value *, Value *> &ChosenPairs, 298 std::multimap<Value *, Value *> &LoadMoveSet, 299 Instruction *I); 300 301 void collectLoadMoveSet(BasicBlock &BB, 302 std::vector<Value *> &PairableInsts, 303 DenseMap<Value *, Value *> &ChosenPairs, 304 std::multimap<Value *, Value *> &LoadMoveSet); 305 306 bool canMoveUsesOfIAfterJ(BasicBlock &BB, 307 std::multimap<Value *, Value *> &LoadMoveSet, 308 Instruction *I, Instruction *J); 309 310 void moveUsesOfIAfterJ(BasicBlock &BB, 311 std::multimap<Value *, Value *> &LoadMoveSet, 312 Instruction *&InsertionPt, 313 Instruction *I, Instruction *J); 314 315 void combineMetadata(Instruction *K, const Instruction *J); 316 317 bool vectorizeBB(BasicBlock &BB) { 318 bool changed = false; 319 // Iterate a sufficient number of times to merge types of size 1 bit, 320 // then 2 bits, then 4, etc. up to half of the target vector width of the 321 // target vector register. 322 for (unsigned v = 2, n = 1; 323 v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter); 324 v *= 2, ++n) { 325 DEBUG(dbgs() << "BBV: fusing loop #" << n << 326 " for " << BB.getName() << " in " << 327 BB.getParent()->getName() << "...\n"); 328 if (vectorizePairs(BB)) 329 changed = true; 330 else 331 break; 332 } 333 334 DEBUG(dbgs() << "BBV: done!\n"); 335 return changed; 336 } 337 338 virtual bool runOnBasicBlock(BasicBlock &BB) { 339 AA = &getAnalysis<AliasAnalysis>(); 340 SE = &getAnalysis<ScalarEvolution>(); 341 TD = getAnalysisIfAvailable<TargetData>(); 342 343 return vectorizeBB(BB); 344 } 345 346 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 347 BasicBlockPass::getAnalysisUsage(AU); 348 AU.addRequired<AliasAnalysis>(); 349 AU.addRequired<ScalarEvolution>(); 350 AU.addPreserved<AliasAnalysis>(); 351 AU.addPreserved<ScalarEvolution>(); 352 AU.setPreservesCFG(); 353 } 354 355 // This returns the vector type that holds a pair of the provided type. 356 // If the provided type is already a vector, then its length is doubled. 357 static inline VectorType *getVecTypeForPair(Type *ElemTy) { 358 if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { 359 unsigned numElem = VTy->getNumElements(); 360 return VectorType::get(ElemTy->getScalarType(), numElem*2); 361 } 362 363 return VectorType::get(ElemTy, 2); 364 } 365 366 // Returns the weight associated with the provided value. A chain of 367 // candidate pairs has a length given by the sum of the weights of its 368 // members (one weight per pair; the weight of each member of the pair 369 // is assumed to be the same). This length is then compared to the 370 // chain-length threshold to determine if a given chain is significant 371 // enough to be vectorized. The length is also used in comparing 372 // candidate chains where longer chains are considered to be better. 373 // Note: when this function returns 0, the resulting instructions are 374 // not actually fused. 375 inline size_t getDepthFactor(Value *V) { 376 // InsertElement and ExtractElement have a depth factor of zero. This is 377 // for two reasons: First, they cannot be usefully fused. Second, because 378 // the pass generates a lot of these, they can confuse the simple metric 379 // used to compare the trees in the next iteration. Thus, giving them a 380 // weight of zero allows the pass to essentially ignore them in 381 // subsequent iterations when looking for vectorization opportunities 382 // while still tracking dependency chains that flow through those 383 // instructions. 384 if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V)) 385 return 0; 386 387 // Give a load or store half of the required depth so that load/store 388 // pairs will vectorize. 389 if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V))) 390 return Config.ReqChainDepth/2; 391 392 return 1; 393 } 394 395 // This determines the relative offset of two loads or stores, returning 396 // true if the offset could be determined to be some constant value. 397 // For example, if OffsetInElmts == 1, then J accesses the memory directly 398 // after I; if OffsetInElmts == -1 then I accesses the memory 399 // directly after J. This function assumes that both instructions 400 // have the same type. 401 bool getPairPtrInfo(Instruction *I, Instruction *J, 402 Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, 403 int64_t &OffsetInElmts) { 404 OffsetInElmts = 0; 405 if (isa<LoadInst>(I)) { 406 IPtr = cast<LoadInst>(I)->getPointerOperand(); 407 JPtr = cast<LoadInst>(J)->getPointerOperand(); 408 IAlignment = cast<LoadInst>(I)->getAlignment(); 409 JAlignment = cast<LoadInst>(J)->getAlignment(); 410 } else { 411 IPtr = cast<StoreInst>(I)->getPointerOperand(); 412 JPtr = cast<StoreInst>(J)->getPointerOperand(); 413 IAlignment = cast<StoreInst>(I)->getAlignment(); 414 JAlignment = cast<StoreInst>(J)->getAlignment(); 415 } 416 417 const SCEV *IPtrSCEV = SE->getSCEV(IPtr); 418 const SCEV *JPtrSCEV = SE->getSCEV(JPtr); 419 420 // If this is a trivial offset, then we'll get something like 421 // 1*sizeof(type). With target data, which we need anyway, this will get 422 // constant folded into a number. 423 const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); 424 if (const SCEVConstant *ConstOffSCEV = 425 dyn_cast<SCEVConstant>(OffsetSCEV)) { 426 ConstantInt *IntOff = ConstOffSCEV->getValue(); 427 int64_t Offset = IntOff->getSExtValue(); 428 429 Type *VTy = cast<PointerType>(IPtr->getType())->getElementType(); 430 int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); 431 432 assert(VTy == cast<PointerType>(JPtr->getType())->getElementType()); 433 434 OffsetInElmts = Offset/VTyTSS; 435 return (abs64(Offset) % VTyTSS) == 0; 436 } 437 438 return false; 439 } 440 441 // Returns true if the provided CallInst represents an intrinsic that can 442 // be vectorized. 443 bool isVectorizableIntrinsic(CallInst* I) { 444 Function *F = I->getCalledFunction(); 445 if (!F) return false; 446 447 unsigned IID = F->getIntrinsicID(); 448 if (!IID) return false; 449 450 switch(IID) { 451 default: 452 return false; 453 case Intrinsic::sqrt: 454 case Intrinsic::powi: 455 case Intrinsic::sin: 456 case Intrinsic::cos: 457 case Intrinsic::log: 458 case Intrinsic::log2: 459 case Intrinsic::log10: 460 case Intrinsic::exp: 461 case Intrinsic::exp2: 462 case Intrinsic::pow: 463 return Config.VectorizeMath; 464 case Intrinsic::fma: 465 return Config.VectorizeFMA; 466 } 467 } 468 469 // Returns true if J is the second element in some pair referenced by 470 // some multimap pair iterator pair. 471 template <typename V> 472 bool isSecondInIteratorPair(V J, std::pair< 473 typename std::multimap<V, V>::iterator, 474 typename std::multimap<V, V>::iterator> PairRange) { 475 for (typename std::multimap<V, V>::iterator K = PairRange.first; 476 K != PairRange.second; ++K) 477 if (K->second == J) return true; 478 479 return false; 480 } 481 }; 482 483 // This function implements one vectorization iteration on the provided 484 // basic block. It returns true if the block is changed. 485 bool BBVectorize::vectorizePairs(BasicBlock &BB) { 486 bool ShouldContinue; 487 BasicBlock::iterator Start = BB.getFirstInsertionPt(); 488 489 std::vector<Value *> AllPairableInsts; 490 DenseMap<Value *, Value *> AllChosenPairs; 491 492 do { 493 std::vector<Value *> PairableInsts; 494 std::multimap<Value *, Value *> CandidatePairs; 495 ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, 496 PairableInsts); 497 if (PairableInsts.empty()) continue; 498 499 // Now we have a map of all of the pairable instructions and we need to 500 // select the best possible pairing. A good pairing is one such that the 501 // users of the pair are also paired. This defines a (directed) forest 502 // over the pairs such that two pairs are connected iff the second pair 503 // uses the first. 504 505 // Note that it only matters that both members of the second pair use some 506 // element of the first pair (to allow for splatting). 507 508 std::multimap<ValuePair, ValuePair> ConnectedPairs; 509 computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs); 510 if (ConnectedPairs.empty()) continue; 511 512 // Build the pairable-instruction dependency map 513 DenseSet<ValuePair> PairableInstUsers; 514 buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); 515 516 // There is now a graph of the connected pairs. For each variable, pick 517 // the pairing with the largest tree meeting the depth requirement on at 518 // least one branch. Then select all pairings that are part of that tree 519 // and remove them from the list of available pairings and pairable 520 // variables. 521 522 DenseMap<Value *, Value *> ChosenPairs; 523 choosePairs(CandidatePairs, PairableInsts, ConnectedPairs, 524 PairableInstUsers, ChosenPairs); 525 526 if (ChosenPairs.empty()) continue; 527 AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), 528 PairableInsts.end()); 529 AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); 530 } while (ShouldContinue); 531 532 if (AllChosenPairs.empty()) return false; 533 NumFusedOps += AllChosenPairs.size(); 534 535 // A set of pairs has now been selected. It is now necessary to replace the 536 // paired instructions with vector instructions. For this procedure each 537 // operand must be replaced with a vector operand. This vector is formed 538 // by using build_vector on the old operands. The replaced values are then 539 // replaced with a vector_extract on the result. Subsequent optimization 540 // passes should coalesce the build/extract combinations. 541 542 fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs); 543 return true; 544 } 545 546 // This function returns true if the provided instruction is capable of being 547 // fused into a vector instruction. This determination is based only on the 548 // type and other attributes of the instruction. 549 bool BBVectorize::isInstVectorizable(Instruction *I, 550 bool &IsSimpleLoadStore) { 551 IsSimpleLoadStore = false; 552 553 if (CallInst *C = dyn_cast<CallInst>(I)) { 554 if (!isVectorizableIntrinsic(C)) 555 return false; 556 } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { 557 // Vectorize simple loads if possbile: 558 IsSimpleLoadStore = L->isSimple(); 559 if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 560 return false; 561 } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { 562 // Vectorize simple stores if possbile: 563 IsSimpleLoadStore = S->isSimple(); 564 if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 565 return false; 566 } else if (CastInst *C = dyn_cast<CastInst>(I)) { 567 // We can vectorize casts, but not casts of pointer types, etc. 568 if (!Config.VectorizeCasts) 569 return false; 570 571 Type *SrcTy = C->getSrcTy(); 572 if (!SrcTy->isSingleValueType()) 573 return false; 574 575 Type *DestTy = C->getDestTy(); 576 if (!DestTy->isSingleValueType()) 577 return false; 578 } else if (isa<SelectInst>(I)) { 579 if (!Config.VectorizeSelect) 580 return false; 581 } else if (isa<CmpInst>(I)) { 582 if (!Config.VectorizeCmp) 583 return false; 584 } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { 585 if (!Config.VectorizeGEP) 586 return false; 587 588 // Currently, vector GEPs exist only with one index. 589 if (G->getNumIndices() != 1) 590 return false; 591 } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || 592 isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { 593 return false; 594 } 595 596 // We can't vectorize memory operations without target data 597 if (TD == 0 && IsSimpleLoadStore) 598 return false; 599 600 Type *T1, *T2; 601 if (isa<StoreInst>(I)) { 602 // For stores, it is the value type, not the pointer type that matters 603 // because the value is what will come from a vector register. 604 605 Value *IVal = cast<StoreInst>(I)->getValueOperand(); 606 T1 = IVal->getType(); 607 } else { 608 T1 = I->getType(); 609 } 610 611 if (I->isCast()) 612 T2 = cast<CastInst>(I)->getSrcTy(); 613 else 614 T2 = T1; 615 616 // Not every type can be vectorized... 617 if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || 618 !(VectorType::isValidElementType(T2) || T2->isVectorTy())) 619 return false; 620 621 if (T1->getScalarSizeInBits() == 1 && T2->getScalarSizeInBits() == 1) { 622 if (!Config.VectorizeBools) 623 return false; 624 } else { 625 if (!Config.VectorizeInts 626 && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) 627 return false; 628 } 629 630 if (!Config.VectorizeFloats 631 && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) 632 return false; 633 634 // Don't vectorize target-specific types. 635 if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy()) 636 return false; 637 if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) 638 return false; 639 640 if ((!Config.VectorizePointers || TD == 0) && 641 (T1->getScalarType()->isPointerTy() || 642 T2->getScalarType()->isPointerTy())) 643 return false; 644 645 if (T1->getPrimitiveSizeInBits() > Config.VectorBits/2 || 646 T2->getPrimitiveSizeInBits() > Config.VectorBits/2) 647 return false; 648 649 return true; 650 } 651 652 // This function returns true if the two provided instructions are compatible 653 // (meaning that they can be fused into a vector instruction). This assumes 654 // that I has already been determined to be vectorizable and that J is not 655 // in the use tree of I. 656 bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, 657 bool IsSimpleLoadStore) { 658 DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << 659 " <-> " << *J << "\n"); 660 661 // Loads and stores can be merged if they have different alignments, 662 // but are otherwise the same. 663 if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment)) 664 return false; 665 666 // FIXME: handle addsub-type operations! 667 668 if (IsSimpleLoadStore) { 669 Value *IPtr, *JPtr; 670 unsigned IAlignment, JAlignment; 671 int64_t OffsetInElmts = 0; 672 if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 673 OffsetInElmts) && abs64(OffsetInElmts) == 1) { 674 if (Config.AlignedOnly) { 675 Type *aType = isa<StoreInst>(I) ? 676 cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); 677 // An aligned load or store is possible only if the instruction 678 // with the lower offset has an alignment suitable for the 679 // vector type. 680 681 unsigned BottomAlignment = IAlignment; 682 if (OffsetInElmts < 0) BottomAlignment = JAlignment; 683 684 Type *VType = getVecTypeForPair(aType); 685 unsigned VecAlignment = TD->getPrefTypeAlignment(VType); 686 if (BottomAlignment < VecAlignment) 687 return false; 688 } 689 } else { 690 return false; 691 } 692 } else if (isa<ShuffleVectorInst>(I)) { 693 // Only merge two shuffles if they're both constant 694 return isa<Constant>(I->getOperand(2)) && 695 isa<Constant>(J->getOperand(2)); 696 // FIXME: We may want to vectorize non-constant shuffles also. 697 } 698 699 // The powi intrinsic is special because only the first argument is 700 // vectorized, the second arguments must be equal. 701 CallInst *CI = dyn_cast<CallInst>(I); 702 Function *FI; 703 if (CI && (FI = CI->getCalledFunction()) && 704 FI->getIntrinsicID() == Intrinsic::powi) { 705 706 Value *A1I = CI->getArgOperand(1), 707 *A1J = cast<CallInst>(J)->getArgOperand(1); 708 const SCEV *A1ISCEV = SE->getSCEV(A1I), 709 *A1JSCEV = SE->getSCEV(A1J); 710 return (A1ISCEV == A1JSCEV); 711 } 712 713 return true; 714 } 715 716 // Figure out whether or not J uses I and update the users and write-set 717 // structures associated with I. Specifically, Users represents the set of 718 // instructions that depend on I. WriteSet represents the set 719 // of memory locations that are dependent on I. If UpdateUsers is true, 720 // and J uses I, then Users is updated to contain J and WriteSet is updated 721 // to contain any memory locations to which J writes. The function returns 722 // true if J uses I. By default, alias analysis is used to determine 723 // whether J reads from memory that overlaps with a location in WriteSet. 724 // If LoadMoveSet is not null, then it is a previously-computed multimap 725 // where the key is the memory-based user instruction and the value is 726 // the instruction to be compared with I. So, if LoadMoveSet is provided, 727 // then the alias analysis is not used. This is necessary because this 728 // function is called during the process of moving instructions during 729 // vectorization and the results of the alias analysis are not stable during 730 // that process. 731 bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, 732 AliasSetTracker &WriteSet, Instruction *I, 733 Instruction *J, bool UpdateUsers, 734 std::multimap<Value *, Value *> *LoadMoveSet) { 735 bool UsesI = false; 736 737 // This instruction may already be marked as a user due, for example, to 738 // being a member of a selected pair. 739 if (Users.count(J)) 740 UsesI = true; 741 742 if (!UsesI) 743 for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); 744 JU != JE; ++JU) { 745 Value *V = *JU; 746 if (I == V || Users.count(V)) { 747 UsesI = true; 748 break; 749 } 750 } 751 if (!UsesI && J->mayReadFromMemory()) { 752 if (LoadMoveSet) { 753 VPIteratorPair JPairRange = LoadMoveSet->equal_range(J); 754 UsesI = isSecondInIteratorPair<Value*>(I, JPairRange); 755 } else { 756 for (AliasSetTracker::iterator W = WriteSet.begin(), 757 WE = WriteSet.end(); W != WE; ++W) { 758 if (W->aliasesUnknownInst(J, *AA)) { 759 UsesI = true; 760 break; 761 } 762 } 763 } 764 } 765 766 if (UsesI && UpdateUsers) { 767 if (J->mayWriteToMemory()) WriteSet.add(J); 768 Users.insert(J); 769 } 770 771 return UsesI; 772 } 773 774 // This function iterates over all instruction pairs in the provided 775 // basic block and collects all candidate pairs for vectorization. 776 bool BBVectorize::getCandidatePairs(BasicBlock &BB, 777 BasicBlock::iterator &Start, 778 std::multimap<Value *, Value *> &CandidatePairs, 779 std::vector<Value *> &PairableInsts) { 780 BasicBlock::iterator E = BB.end(); 781 if (Start == E) return false; 782 783 bool ShouldContinue = false, IAfterStart = false; 784 for (BasicBlock::iterator I = Start++; I != E; ++I) { 785 if (I == Start) IAfterStart = true; 786 787 bool IsSimpleLoadStore; 788 if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; 789 790 // Look for an instruction with which to pair instruction *I... 791 DenseSet<Value *> Users; 792 AliasSetTracker WriteSet(*AA); 793 bool JAfterStart = IAfterStart; 794 BasicBlock::iterator J = llvm::next(I); 795 for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { 796 if (J == Start) JAfterStart = true; 797 798 // Determine if J uses I, if so, exit the loop. 799 bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep); 800 if (Config.FastDep) { 801 // Note: For this heuristic to be effective, independent operations 802 // must tend to be intermixed. This is likely to be true from some 803 // kinds of grouped loop unrolling (but not the generic LLVM pass), 804 // but otherwise may require some kind of reordering pass. 805 806 // When using fast dependency analysis, 807 // stop searching after first use: 808 if (UsesI) break; 809 } else { 810 if (UsesI) continue; 811 } 812 813 // J does not use I, and comes before the first use of I, so it can be 814 // merged with I if the instructions are compatible. 815 if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue; 816 817 // J is a candidate for merging with I. 818 if (!PairableInsts.size() || 819 PairableInsts[PairableInsts.size()-1] != I) { 820 PairableInsts.push_back(I); 821 } 822 823 CandidatePairs.insert(ValuePair(I, J)); 824 825 // The next call to this function must start after the last instruction 826 // selected during this invocation. 827 if (JAfterStart) { 828 Start = llvm::next(J); 829 IAfterStart = JAfterStart = false; 830 } 831 832 DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " 833 << *I << " <-> " << *J << "\n"); 834 835 // If we have already found too many pairs, break here and this function 836 // will be called again starting after the last instruction selected 837 // during this invocation. 838 if (PairableInsts.size() >= Config.MaxInsts) { 839 ShouldContinue = true; 840 break; 841 } 842 } 843 844 if (ShouldContinue) 845 break; 846 } 847 848 DEBUG(dbgs() << "BBV: found " << PairableInsts.size() 849 << " instructions with candidate pairs\n"); 850 851 return ShouldContinue; 852 } 853 854 // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that 855 // it looks for pairs such that both members have an input which is an 856 // output of PI or PJ. 857 void BBVectorize::computePairsConnectedTo( 858 std::multimap<Value *, Value *> &CandidatePairs, 859 std::vector<Value *> &PairableInsts, 860 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 861 ValuePair P) { 862 StoreInst *SI, *SJ; 863 864 // For each possible pairing for this variable, look at the uses of 865 // the first value... 866 for (Value::use_iterator I = P.first->use_begin(), 867 E = P.first->use_end(); I != E; ++I) { 868 if (isa<LoadInst>(*I)) { 869 // A pair cannot be connected to a load because the load only takes one 870 // operand (the address) and it is a scalar even after vectorization. 871 continue; 872 } else if ((SI = dyn_cast<StoreInst>(*I)) && 873 P.first == SI->getPointerOperand()) { 874 // Similarly, a pair cannot be connected to a store through its 875 // pointer operand. 876 continue; 877 } 878 879 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); 880 881 // For each use of the first variable, look for uses of the second 882 // variable... 883 for (Value::use_iterator J = P.second->use_begin(), 884 E2 = P.second->use_end(); J != E2; ++J) { 885 if ((SJ = dyn_cast<StoreInst>(*J)) && 886 P.second == SJ->getPointerOperand()) 887 continue; 888 889 VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); 890 891 // Look for <I, J>: 892 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) 893 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); 894 895 // Look for <J, I>: 896 if (isSecondInIteratorPair<Value*>(*I, JPairRange)) 897 ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I))); 898 } 899 900 if (Config.SplatBreaksChain) continue; 901 // Look for cases where just the first value in the pair is used by 902 // both members of another pair (splatting). 903 for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) { 904 if ((SJ = dyn_cast<StoreInst>(*J)) && 905 P.first == SJ->getPointerOperand()) 906 continue; 907 908 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) 909 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); 910 } 911 } 912 913 if (Config.SplatBreaksChain) return; 914 // Look for cases where just the second value in the pair is used by 915 // both members of another pair (splatting). 916 for (Value::use_iterator I = P.second->use_begin(), 917 E = P.second->use_end(); I != E; ++I) { 918 if (isa<LoadInst>(*I)) 919 continue; 920 else if ((SI = dyn_cast<StoreInst>(*I)) && 921 P.second == SI->getPointerOperand()) 922 continue; 923 924 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); 925 926 for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { 927 if ((SJ = dyn_cast<StoreInst>(*J)) && 928 P.second == SJ->getPointerOperand()) 929 continue; 930 931 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) 932 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); 933 } 934 } 935 } 936 937 // This function figures out which pairs are connected. Two pairs are 938 // connected if some output of the first pair forms an input to both members 939 // of the second pair. 940 void BBVectorize::computeConnectedPairs( 941 std::multimap<Value *, Value *> &CandidatePairs, 942 std::vector<Value *> &PairableInsts, 943 std::multimap<ValuePair, ValuePair> &ConnectedPairs) { 944 945 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 946 PE = PairableInsts.end(); PI != PE; ++PI) { 947 VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI); 948 949 for (std::multimap<Value *, Value *>::iterator P = choiceRange.first; 950 P != choiceRange.second; ++P) 951 computePairsConnectedTo(CandidatePairs, PairableInsts, 952 ConnectedPairs, *P); 953 } 954 955 DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size() 956 << " pair connections.\n"); 957 } 958 959 // This function builds a set of use tuples such that <A, B> is in the set 960 // if B is in the use tree of A. If B is in the use tree of A, then B 961 // depends on the output of A. 962 void BBVectorize::buildDepMap( 963 BasicBlock &BB, 964 std::multimap<Value *, Value *> &CandidatePairs, 965 std::vector<Value *> &PairableInsts, 966 DenseSet<ValuePair> &PairableInstUsers) { 967 DenseSet<Value *> IsInPair; 968 for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(), 969 E = CandidatePairs.end(); C != E; ++C) { 970 IsInPair.insert(C->first); 971 IsInPair.insert(C->second); 972 } 973 974 // Iterate through the basic block, recording all Users of each 975 // pairable instruction. 976 977 BasicBlock::iterator E = BB.end(); 978 for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { 979 if (IsInPair.find(I) == IsInPair.end()) continue; 980 981 DenseSet<Value *> Users; 982 AliasSetTracker WriteSet(*AA); 983 for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) 984 (void) trackUsesOfI(Users, WriteSet, I, J); 985 986 for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); 987 U != E; ++U) 988 PairableInstUsers.insert(ValuePair(I, *U)); 989 } 990 } 991 992 // Returns true if an input to pair P is an output of pair Q and also an 993 // input of pair Q is an output of pair P. If this is the case, then these 994 // two pairs cannot be simultaneously fused. 995 bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, 996 DenseSet<ValuePair> &PairableInstUsers, 997 std::multimap<ValuePair, ValuePair> *PairableInstUserMap) { 998 // Two pairs are in conflict if they are mutual Users of eachother. 999 bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || 1000 PairableInstUsers.count(ValuePair(P.first, Q.second)) || 1001 PairableInstUsers.count(ValuePair(P.second, Q.first)) || 1002 PairableInstUsers.count(ValuePair(P.second, Q.second)); 1003 bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || 1004 PairableInstUsers.count(ValuePair(Q.first, P.second)) || 1005 PairableInstUsers.count(ValuePair(Q.second, P.first)) || 1006 PairableInstUsers.count(ValuePair(Q.second, P.second)); 1007 if (PairableInstUserMap) { 1008 // FIXME: The expensive part of the cycle check is not so much the cycle 1009 // check itself but this edge insertion procedure. This needs some 1010 // profiling and probably a different data structure (same is true of 1011 // most uses of std::multimap). 1012 if (PUsesQ) { 1013 VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q); 1014 if (!isSecondInIteratorPair(P, QPairRange)) 1015 PairableInstUserMap->insert(VPPair(Q, P)); 1016 } 1017 if (QUsesP) { 1018 VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P); 1019 if (!isSecondInIteratorPair(Q, PPairRange)) 1020 PairableInstUserMap->insert(VPPair(P, Q)); 1021 } 1022 } 1023 1024 return (QUsesP && PUsesQ); 1025 } 1026 1027 // This function walks the use graph of current pairs to see if, starting 1028 // from P, the walk returns to P. 1029 bool BBVectorize::pairWillFormCycle(ValuePair P, 1030 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 1031 DenseSet<ValuePair> &CurrentPairs) { 1032 DEBUG(if (DebugCycleCheck) 1033 dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " 1034 << *P.second << "\n"); 1035 // A lookup table of visisted pairs is kept because the PairableInstUserMap 1036 // contains non-direct associations. 1037 DenseSet<ValuePair> Visited; 1038 SmallVector<ValuePair, 32> Q; 1039 // General depth-first post-order traversal: 1040 Q.push_back(P); 1041 do { 1042 ValuePair QTop = Q.pop_back_val(); 1043 Visited.insert(QTop); 1044 1045 DEBUG(if (DebugCycleCheck) 1046 dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " 1047 << *QTop.second << "\n"); 1048 VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop); 1049 for (std::multimap<ValuePair, ValuePair>::iterator C = QPairRange.first; 1050 C != QPairRange.second; ++C) { 1051 if (C->second == P) { 1052 DEBUG(dbgs() 1053 << "BBV: rejected to prevent non-trivial cycle formation: " 1054 << *C->first.first << " <-> " << *C->first.second << "\n"); 1055 return true; 1056 } 1057 1058 if (CurrentPairs.count(C->second) && !Visited.count(C->second)) 1059 Q.push_back(C->second); 1060 } 1061 } while (!Q.empty()); 1062 1063 return false; 1064 } 1065 1066 // This function builds the initial tree of connected pairs with the 1067 // pair J at the root. 1068 void BBVectorize::buildInitialTreeFor( 1069 std::multimap<Value *, Value *> &CandidatePairs, 1070 std::vector<Value *> &PairableInsts, 1071 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1072 DenseSet<ValuePair> &PairableInstUsers, 1073 DenseMap<Value *, Value *> &ChosenPairs, 1074 DenseMap<ValuePair, size_t> &Tree, ValuePair J) { 1075 // Each of these pairs is viewed as the root node of a Tree. The Tree 1076 // is then walked (depth-first). As this happens, we keep track of 1077 // the pairs that compose the Tree and the maximum depth of the Tree. 1078 SmallVector<ValuePairWithDepth, 32> Q; 1079 // General depth-first post-order traversal: 1080 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1081 do { 1082 ValuePairWithDepth QTop = Q.back(); 1083 1084 // Push each child onto the queue: 1085 bool MoreChildren = false; 1086 size_t MaxChildDepth = QTop.second; 1087 VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first); 1088 for (std::multimap<ValuePair, ValuePair>::iterator k = qtRange.first; 1089 k != qtRange.second; ++k) { 1090 // Make sure that this child pair is still a candidate: 1091 bool IsStillCand = false; 1092 VPIteratorPair checkRange = 1093 CandidatePairs.equal_range(k->second.first); 1094 for (std::multimap<Value *, Value *>::iterator m = checkRange.first; 1095 m != checkRange.second; ++m) { 1096 if (m->second == k->second.second) { 1097 IsStillCand = true; 1098 break; 1099 } 1100 } 1101 1102 if (IsStillCand) { 1103 DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second); 1104 if (C == Tree.end()) { 1105 size_t d = getDepthFactor(k->second.first); 1106 Q.push_back(ValuePairWithDepth(k->second, QTop.second+d)); 1107 MoreChildren = true; 1108 } else { 1109 MaxChildDepth = std::max(MaxChildDepth, C->second); 1110 } 1111 } 1112 } 1113 1114 if (!MoreChildren) { 1115 // Record the current pair as part of the Tree: 1116 Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); 1117 Q.pop_back(); 1118 } 1119 } while (!Q.empty()); 1120 } 1121 1122 // Given some initial tree, prune it by removing conflicting pairs (pairs 1123 // that cannot be simultaneously chosen for vectorization). 1124 void BBVectorize::pruneTreeFor( 1125 std::multimap<Value *, Value *> &CandidatePairs, 1126 std::vector<Value *> &PairableInsts, 1127 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1128 DenseSet<ValuePair> &PairableInstUsers, 1129 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 1130 DenseMap<Value *, Value *> &ChosenPairs, 1131 DenseMap<ValuePair, size_t> &Tree, 1132 DenseSet<ValuePair> &PrunedTree, ValuePair J, 1133 bool UseCycleCheck) { 1134 SmallVector<ValuePairWithDepth, 32> Q; 1135 // General depth-first post-order traversal: 1136 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1137 do { 1138 ValuePairWithDepth QTop = Q.pop_back_val(); 1139 PrunedTree.insert(QTop.first); 1140 1141 // Visit each child, pruning as necessary... 1142 DenseMap<ValuePair, size_t> BestChildren; 1143 VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first); 1144 for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first; 1145 K != QTopRange.second; ++K) { 1146 DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second); 1147 if (C == Tree.end()) continue; 1148 1149 // This child is in the Tree, now we need to make sure it is the 1150 // best of any conflicting children. There could be multiple 1151 // conflicting children, so first, determine if we're keeping 1152 // this child, then delete conflicting children as necessary. 1153 1154 // It is also necessary to guard against pairing-induced 1155 // dependencies. Consider instructions a .. x .. y .. b 1156 // such that (a,b) are to be fused and (x,y) are to be fused 1157 // but a is an input to x and b is an output from y. This 1158 // means that y cannot be moved after b but x must be moved 1159 // after b for (a,b) to be fused. In other words, after 1160 // fusing (a,b) we have y .. a/b .. x where y is an input 1161 // to a/b and x is an output to a/b: x and y can no longer 1162 // be legally fused. To prevent this condition, we must 1163 // make sure that a child pair added to the Tree is not 1164 // both an input and output of an already-selected pair. 1165 1166 // Pairing-induced dependencies can also form from more complicated 1167 // cycles. The pair vs. pair conflicts are easy to check, and so 1168 // that is done explicitly for "fast rejection", and because for 1169 // child vs. child conflicts, we may prefer to keep the current 1170 // pair in preference to the already-selected child. 1171 DenseSet<ValuePair> CurrentPairs; 1172 1173 bool CanAdd = true; 1174 for (DenseMap<ValuePair, size_t>::iterator C2 1175 = BestChildren.begin(), E2 = BestChildren.end(); 1176 C2 != E2; ++C2) { 1177 if (C2->first.first == C->first.first || 1178 C2->first.first == C->first.second || 1179 C2->first.second == C->first.first || 1180 C2->first.second == C->first.second || 1181 pairsConflict(C2->first, C->first, PairableInstUsers, 1182 UseCycleCheck ? &PairableInstUserMap : 0)) { 1183 if (C2->second >= C->second) { 1184 CanAdd = false; 1185 break; 1186 } 1187 1188 CurrentPairs.insert(C2->first); 1189 } 1190 } 1191 if (!CanAdd) continue; 1192 1193 // Even worse, this child could conflict with another node already 1194 // selected for the Tree. If that is the case, ignore this child. 1195 for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(), 1196 E2 = PrunedTree.end(); T != E2; ++T) { 1197 if (T->first == C->first.first || 1198 T->first == C->first.second || 1199 T->second == C->first.first || 1200 T->second == C->first.second || 1201 pairsConflict(*T, C->first, PairableInstUsers, 1202 UseCycleCheck ? &PairableInstUserMap : 0)) { 1203 CanAdd = false; 1204 break; 1205 } 1206 1207 CurrentPairs.insert(*T); 1208 } 1209 if (!CanAdd) continue; 1210 1211 // And check the queue too... 1212 for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(), 1213 E2 = Q.end(); C2 != E2; ++C2) { 1214 if (C2->first.first == C->first.first || 1215 C2->first.first == C->first.second || 1216 C2->first.second == C->first.first || 1217 C2->first.second == C->first.second || 1218 pairsConflict(C2->first, C->first, PairableInstUsers, 1219 UseCycleCheck ? &PairableInstUserMap : 0)) { 1220 CanAdd = false; 1221 break; 1222 } 1223 1224 CurrentPairs.insert(C2->first); 1225 } 1226 if (!CanAdd) continue; 1227 1228 // Last but not least, check for a conflict with any of the 1229 // already-chosen pairs. 1230 for (DenseMap<Value *, Value *>::iterator C2 = 1231 ChosenPairs.begin(), E2 = ChosenPairs.end(); 1232 C2 != E2; ++C2) { 1233 if (pairsConflict(*C2, C->first, PairableInstUsers, 1234 UseCycleCheck ? &PairableInstUserMap : 0)) { 1235 CanAdd = false; 1236 break; 1237 } 1238 1239 CurrentPairs.insert(*C2); 1240 } 1241 if (!CanAdd) continue; 1242 1243 // To check for non-trivial cycles formed by the addition of the 1244 // current pair we've formed a list of all relevant pairs, now use a 1245 // graph walk to check for a cycle. We start from the current pair and 1246 // walk the use tree to see if we again reach the current pair. If we 1247 // do, then the current pair is rejected. 1248 1249 // FIXME: It may be more efficient to use a topological-ordering 1250 // algorithm to improve the cycle check. This should be investigated. 1251 if (UseCycleCheck && 1252 pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) 1253 continue; 1254 1255 // This child can be added, but we may have chosen it in preference 1256 // to an already-selected child. Check for this here, and if a 1257 // conflict is found, then remove the previously-selected child 1258 // before adding this one in its place. 1259 for (DenseMap<ValuePair, size_t>::iterator C2 1260 = BestChildren.begin(); C2 != BestChildren.end();) { 1261 if (C2->first.first == C->first.first || 1262 C2->first.first == C->first.second || 1263 C2->first.second == C->first.first || 1264 C2->first.second == C->first.second || 1265 pairsConflict(C2->first, C->first, PairableInstUsers)) 1266 BestChildren.erase(C2++); 1267 else 1268 ++C2; 1269 } 1270 1271 BestChildren.insert(ValuePairWithDepth(C->first, C->second)); 1272 } 1273 1274 for (DenseMap<ValuePair, size_t>::iterator C 1275 = BestChildren.begin(), E2 = BestChildren.end(); 1276 C != E2; ++C) { 1277 size_t DepthF = getDepthFactor(C->first.first); 1278 Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); 1279 } 1280 } while (!Q.empty()); 1281 } 1282 1283 // This function finds the best tree of mututally-compatible connected 1284 // pairs, given the choice of root pairs as an iterator range. 1285 void BBVectorize::findBestTreeFor( 1286 std::multimap<Value *, Value *> &CandidatePairs, 1287 std::vector<Value *> &PairableInsts, 1288 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1289 DenseSet<ValuePair> &PairableInstUsers, 1290 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 1291 DenseMap<Value *, Value *> &ChosenPairs, 1292 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, 1293 size_t &BestEffSize, VPIteratorPair ChoiceRange, 1294 bool UseCycleCheck) { 1295 for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first; 1296 J != ChoiceRange.second; ++J) { 1297 1298 // Before going any further, make sure that this pair does not 1299 // conflict with any already-selected pairs (see comment below 1300 // near the Tree pruning for more details). 1301 DenseSet<ValuePair> ChosenPairSet; 1302 bool DoesConflict = false; 1303 for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), 1304 E = ChosenPairs.end(); C != E; ++C) { 1305 if (pairsConflict(*C, *J, PairableInstUsers, 1306 UseCycleCheck ? &PairableInstUserMap : 0)) { 1307 DoesConflict = true; 1308 break; 1309 } 1310 1311 ChosenPairSet.insert(*C); 1312 } 1313 if (DoesConflict) continue; 1314 1315 if (UseCycleCheck && 1316 pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet)) 1317 continue; 1318 1319 DenseMap<ValuePair, size_t> Tree; 1320 buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1321 PairableInstUsers, ChosenPairs, Tree, *J); 1322 1323 // Because we'll keep the child with the largest depth, the largest 1324 // depth is still the same in the unpruned Tree. 1325 size_t MaxDepth = Tree.lookup(*J); 1326 1327 DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {" 1328 << *J->first << " <-> " << *J->second << "} of depth " << 1329 MaxDepth << " and size " << Tree.size() << "\n"); 1330 1331 // At this point the Tree has been constructed, but, may contain 1332 // contradictory children (meaning that different children of 1333 // some tree node may be attempting to fuse the same instruction). 1334 // So now we walk the tree again, in the case of a conflict, 1335 // keep only the child with the largest depth. To break a tie, 1336 // favor the first child. 1337 1338 DenseSet<ValuePair> PrunedTree; 1339 pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1340 PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree, 1341 PrunedTree, *J, UseCycleCheck); 1342 1343 size_t EffSize = 0; 1344 for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), 1345 E = PrunedTree.end(); S != E; ++S) 1346 EffSize += getDepthFactor(S->first); 1347 1348 DEBUG(if (DebugPairSelection) 1349 dbgs() << "BBV: found pruned Tree for pair {" 1350 << *J->first << " <-> " << *J->second << "} of depth " << 1351 MaxDepth << " and size " << PrunedTree.size() << 1352 " (effective size: " << EffSize << ")\n"); 1353 if (MaxDepth >= Config.ReqChainDepth && EffSize > BestEffSize) { 1354 BestMaxDepth = MaxDepth; 1355 BestEffSize = EffSize; 1356 BestTree = PrunedTree; 1357 } 1358 } 1359 } 1360 1361 // Given the list of candidate pairs, this function selects those 1362 // that will be fused into vector instructions. 1363 void BBVectorize::choosePairs( 1364 std::multimap<Value *, Value *> &CandidatePairs, 1365 std::vector<Value *> &PairableInsts, 1366 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1367 DenseSet<ValuePair> &PairableInstUsers, 1368 DenseMap<Value *, Value *>& ChosenPairs) { 1369 bool UseCycleCheck = 1370 CandidatePairs.size() <= Config.MaxCandPairsForCycleCheck; 1371 std::multimap<ValuePair, ValuePair> PairableInstUserMap; 1372 for (std::vector<Value *>::iterator I = PairableInsts.begin(), 1373 E = PairableInsts.end(); I != E; ++I) { 1374 // The number of possible pairings for this variable: 1375 size_t NumChoices = CandidatePairs.count(*I); 1376 if (!NumChoices) continue; 1377 1378 VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I); 1379 1380 // The best pair to choose and its tree: 1381 size_t BestMaxDepth = 0, BestEffSize = 0; 1382 DenseSet<ValuePair> BestTree; 1383 findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1384 PairableInstUsers, PairableInstUserMap, ChosenPairs, 1385 BestTree, BestMaxDepth, BestEffSize, ChoiceRange, 1386 UseCycleCheck); 1387 1388 // A tree has been chosen (or not) at this point. If no tree was 1389 // chosen, then this instruction, I, cannot be paired (and is no longer 1390 // considered). 1391 1392 DEBUG(if (BestTree.size() > 0) 1393 dbgs() << "BBV: selected pairs in the best tree for: " 1394 << *cast<Instruction>(*I) << "\n"); 1395 1396 for (DenseSet<ValuePair>::iterator S = BestTree.begin(), 1397 SE2 = BestTree.end(); S != SE2; ++S) { 1398 // Insert the members of this tree into the list of chosen pairs. 1399 ChosenPairs.insert(ValuePair(S->first, S->second)); 1400 DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << 1401 *S->second << "\n"); 1402 1403 // Remove all candidate pairs that have values in the chosen tree. 1404 for (std::multimap<Value *, Value *>::iterator K = 1405 CandidatePairs.begin(); K != CandidatePairs.end();) { 1406 if (K->first == S->first || K->second == S->first || 1407 K->second == S->second || K->first == S->second) { 1408 // Don't remove the actual pair chosen so that it can be used 1409 // in subsequent tree selections. 1410 if (!(K->first == S->first && K->second == S->second)) 1411 CandidatePairs.erase(K++); 1412 else 1413 ++K; 1414 } else { 1415 ++K; 1416 } 1417 } 1418 } 1419 } 1420 1421 DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); 1422 } 1423 1424 std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, 1425 unsigned n = 0) { 1426 if (!I->hasName()) 1427 return ""; 1428 1429 return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + 1430 (n > 0 ? "." + utostr(n) : "")).str(); 1431 } 1432 1433 // Returns the value that is to be used as the pointer input to the vector 1434 // instruction that fuses I with J. 1435 Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, 1436 Instruction *I, Instruction *J, unsigned o, 1437 bool &FlipMemInputs) { 1438 Value *IPtr, *JPtr; 1439 unsigned IAlignment, JAlignment; 1440 int64_t OffsetInElmts; 1441 (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 1442 OffsetInElmts); 1443 1444 // The pointer value is taken to be the one with the lowest offset. 1445 Value *VPtr; 1446 if (OffsetInElmts > 0) { 1447 VPtr = IPtr; 1448 } else { 1449 FlipMemInputs = true; 1450 VPtr = JPtr; 1451 } 1452 1453 Type *ArgType = cast<PointerType>(IPtr->getType())->getElementType(); 1454 Type *VArgType = getVecTypeForPair(ArgType); 1455 Type *VArgPtrType = PointerType::get(VArgType, 1456 cast<PointerType>(IPtr->getType())->getAddressSpace()); 1457 return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), 1458 /* insert before */ FlipMemInputs ? J : I); 1459 } 1460 1461 void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, 1462 unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, 1463 unsigned IdxOffset, std::vector<Constant*> &Mask) { 1464 for (unsigned v = 0; v < NumElem/2; ++v) { 1465 int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); 1466 if (m < 0) { 1467 Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); 1468 } else { 1469 unsigned mm = m + (int) IdxOffset; 1470 if (m >= (int) NumInElem) 1471 mm += (int) NumInElem; 1472 1473 Mask[v+MaskOffset] = 1474 ConstantInt::get(Type::getInt32Ty(Context), mm); 1475 } 1476 } 1477 } 1478 1479 // Returns the value that is to be used as the vector-shuffle mask to the 1480 // vector instruction that fuses I with J. 1481 Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, 1482 Instruction *I, Instruction *J) { 1483 // This is the shuffle mask. We need to append the second 1484 // mask to the first, and the numbers need to be adjusted. 1485 1486 Type *ArgType = I->getType(); 1487 Type *VArgType = getVecTypeForPair(ArgType); 1488 1489 // Get the total number of elements in the fused vector type. 1490 // By definition, this must equal the number of elements in 1491 // the final mask. 1492 unsigned NumElem = cast<VectorType>(VArgType)->getNumElements(); 1493 std::vector<Constant*> Mask(NumElem); 1494 1495 Type *OpType = I->getOperand(0)->getType(); 1496 unsigned NumInElem = cast<VectorType>(OpType)->getNumElements(); 1497 1498 // For the mask from the first pair... 1499 fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask); 1500 1501 // For the mask from the second pair... 1502 fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem, 1503 Mask); 1504 1505 return ConstantVector::get(Mask); 1506 } 1507 1508 // Returns the value to be used as the specified operand of the vector 1509 // instruction that fuses I with J. 1510 Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, 1511 Instruction *J, unsigned o, bool FlipMemInputs) { 1512 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 1513 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); 1514 1515 // Compute the fused vector type for this operand 1516 Type *ArgType = I->getOperand(o)->getType(); 1517 VectorType *VArgType = getVecTypeForPair(ArgType); 1518 1519 Instruction *L = I, *H = J; 1520 if (FlipMemInputs) { 1521 L = J; 1522 H = I; 1523 } 1524 1525 if (ArgType->isVectorTy()) { 1526 unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); 1527 std::vector<Constant*> Mask(numElem); 1528 for (unsigned v = 0; v < numElem; ++v) 1529 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 1530 1531 Instruction *BV = new ShuffleVectorInst(L->getOperand(o), 1532 H->getOperand(o), 1533 ConstantVector::get(Mask), 1534 getReplacementName(I, true, o)); 1535 BV->insertBefore(J); 1536 return BV; 1537 } 1538 1539 // If these two inputs are the output of another vector instruction, 1540 // then we should use that output directly. It might be necessary to 1541 // permute it first. [When pairings are fused recursively, you can 1542 // end up with cases where a large vector is decomposed into scalars 1543 // using extractelement instructions, then built into size-2 1544 // vectors using insertelement and the into larger vectors using 1545 // shuffles. InstCombine does not simplify all of these cases well, 1546 // and so we make sure that shuffles are generated here when possible. 1547 ExtractElementInst *LEE 1548 = dyn_cast<ExtractElementInst>(L->getOperand(o)); 1549 ExtractElementInst *HEE 1550 = dyn_cast<ExtractElementInst>(H->getOperand(o)); 1551 1552 if (LEE && HEE && 1553 LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) { 1554 VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType()); 1555 unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue(); 1556 unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue(); 1557 if (LEE->getOperand(0) == HEE->getOperand(0)) { 1558 if (LowIndx == 0 && HighIndx == 1) 1559 return LEE->getOperand(0); 1560 1561 std::vector<Constant*> Mask(2); 1562 Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); 1563 Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); 1564 1565 Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), 1566 UndefValue::get(EEType), 1567 ConstantVector::get(Mask), 1568 getReplacementName(I, true, o)); 1569 BV->insertBefore(J); 1570 return BV; 1571 } 1572 1573 std::vector<Constant*> Mask(2); 1574 HighIndx += EEType->getNumElements(); 1575 Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); 1576 Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); 1577 1578 Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), 1579 HEE->getOperand(0), 1580 ConstantVector::get(Mask), 1581 getReplacementName(I, true, o)); 1582 BV->insertBefore(J); 1583 return BV; 1584 } 1585 1586 Instruction *BV1 = InsertElementInst::Create( 1587 UndefValue::get(VArgType), 1588 L->getOperand(o), CV0, 1589 getReplacementName(I, true, o, 1)); 1590 BV1->insertBefore(I); 1591 Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o), 1592 CV1, 1593 getReplacementName(I, true, o, 2)); 1594 BV2->insertBefore(J); 1595 return BV2; 1596 } 1597 1598 // This function creates an array of values that will be used as the inputs 1599 // to the vector instruction that fuses I with J. 1600 void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, 1601 Instruction *I, Instruction *J, 1602 SmallVector<Value *, 3> &ReplacedOperands, 1603 bool &FlipMemInputs) { 1604 FlipMemInputs = false; 1605 unsigned NumOperands = I->getNumOperands(); 1606 1607 for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { 1608 // Iterate backward so that we look at the store pointer 1609 // first and know whether or not we need to flip the inputs. 1610 1611 if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { 1612 // This is the pointer for a load/store instruction. 1613 ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o, 1614 FlipMemInputs); 1615 continue; 1616 } else if (isa<CallInst>(I)) { 1617 Function *F = cast<CallInst>(I)->getCalledFunction(); 1618 unsigned IID = F->getIntrinsicID(); 1619 if (o == NumOperands-1) { 1620 BasicBlock &BB = *I->getParent(); 1621 1622 Module *M = BB.getParent()->getParent(); 1623 Type *ArgType = I->getType(); 1624 Type *VArgType = getVecTypeForPair(ArgType); 1625 1626 // FIXME: is it safe to do this here? 1627 ReplacedOperands[o] = Intrinsic::getDeclaration(M, 1628 (Intrinsic::ID) IID, VArgType); 1629 continue; 1630 } else if (IID == Intrinsic::powi && o == 1) { 1631 // The second argument of powi is a single integer and we've already 1632 // checked that both arguments are equal. As a result, we just keep 1633 // I's second argument. 1634 ReplacedOperands[o] = I->getOperand(o); 1635 continue; 1636 } 1637 } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) { 1638 ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); 1639 continue; 1640 } 1641 1642 ReplacedOperands[o] = 1643 getReplacementInput(Context, I, J, o, FlipMemInputs); 1644 } 1645 } 1646 1647 // This function creates two values that represent the outputs of the 1648 // original I and J instructions. These are generally vector shuffles 1649 // or extracts. In many cases, these will end up being unused and, thus, 1650 // eliminated by later passes. 1651 void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 1652 Instruction *J, Instruction *K, 1653 Instruction *&InsertionPt, 1654 Instruction *&K1, Instruction *&K2, 1655 bool &FlipMemInputs) { 1656 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 1657 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); 1658 1659 if (isa<StoreInst>(I)) { 1660 AA->replaceWithNewValue(I, K); 1661 AA->replaceWithNewValue(J, K); 1662 } else { 1663 Type *IType = I->getType(); 1664 Type *VType = getVecTypeForPair(IType); 1665 1666 if (IType->isVectorTy()) { 1667 unsigned numElem = cast<VectorType>(IType)->getNumElements(); 1668 std::vector<Constant*> Mask1(numElem), Mask2(numElem); 1669 for (unsigned v = 0; v < numElem; ++v) { 1670 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 1671 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v); 1672 } 1673 1674 K1 = new ShuffleVectorInst(K, UndefValue::get(VType), 1675 ConstantVector::get( 1676 FlipMemInputs ? Mask2 : Mask1), 1677 getReplacementName(K, false, 1)); 1678 K2 = new ShuffleVectorInst(K, UndefValue::get(VType), 1679 ConstantVector::get( 1680 FlipMemInputs ? Mask1 : Mask2), 1681 getReplacementName(K, false, 2)); 1682 } else { 1683 K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0, 1684 getReplacementName(K, false, 1)); 1685 K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1, 1686 getReplacementName(K, false, 2)); 1687 } 1688 1689 K1->insertAfter(K); 1690 K2->insertAfter(K1); 1691 InsertionPt = K2; 1692 } 1693 } 1694 1695 // Move all uses of the function I (including pairing-induced uses) after J. 1696 bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, 1697 std::multimap<Value *, Value *> &LoadMoveSet, 1698 Instruction *I, Instruction *J) { 1699 // Skip to the first instruction past I. 1700 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 1701 1702 DenseSet<Value *> Users; 1703 AliasSetTracker WriteSet(*AA); 1704 for (; cast<Instruction>(L) != J; ++L) 1705 (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet); 1706 1707 assert(cast<Instruction>(L) == J && 1708 "Tracking has not proceeded far enough to check for dependencies"); 1709 // If J is now in the use set of I, then trackUsesOfI will return true 1710 // and we have a dependency cycle (and the fusing operation must abort). 1711 return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSet); 1712 } 1713 1714 // Move all uses of the function I (including pairing-induced uses) after J. 1715 void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, 1716 std::multimap<Value *, Value *> &LoadMoveSet, 1717 Instruction *&InsertionPt, 1718 Instruction *I, Instruction *J) { 1719 // Skip to the first instruction past I. 1720 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 1721 1722 DenseSet<Value *> Users; 1723 AliasSetTracker WriteSet(*AA); 1724 for (; cast<Instruction>(L) != J;) { 1725 if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) { 1726 // Move this instruction 1727 Instruction *InstToMove = L; ++L; 1728 1729 DEBUG(dbgs() << "BBV: moving: " << *InstToMove << 1730 " to after " << *InsertionPt << "\n"); 1731 InstToMove->removeFromParent(); 1732 InstToMove->insertAfter(InsertionPt); 1733 InsertionPt = InstToMove; 1734 } else { 1735 ++L; 1736 } 1737 } 1738 } 1739 1740 // Collect all load instruction that are in the move set of a given first 1741 // pair member. These loads depend on the first instruction, I, and so need 1742 // to be moved after J (the second instruction) when the pair is fused. 1743 void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, 1744 DenseMap<Value *, Value *> &ChosenPairs, 1745 std::multimap<Value *, Value *> &LoadMoveSet, 1746 Instruction *I) { 1747 // Skip to the first instruction past I. 1748 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 1749 1750 DenseSet<Value *> Users; 1751 AliasSetTracker WriteSet(*AA); 1752 1753 // Note: We cannot end the loop when we reach J because J could be moved 1754 // farther down the use chain by another instruction pairing. Also, J 1755 // could be before I if this is an inverted input. 1756 for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) { 1757 if (trackUsesOfI(Users, WriteSet, I, L)) { 1758 if (L->mayReadFromMemory()) 1759 LoadMoveSet.insert(ValuePair(L, I)); 1760 } 1761 } 1762 } 1763 1764 // In cases where both load/stores and the computation of their pointers 1765 // are chosen for vectorization, we can end up in a situation where the 1766 // aliasing analysis starts returning different query results as the 1767 // process of fusing instruction pairs continues. Because the algorithm 1768 // relies on finding the same use trees here as were found earlier, we'll 1769 // need to precompute the necessary aliasing information here and then 1770 // manually update it during the fusion process. 1771 void BBVectorize::collectLoadMoveSet(BasicBlock &BB, 1772 std::vector<Value *> &PairableInsts, 1773 DenseMap<Value *, Value *> &ChosenPairs, 1774 std::multimap<Value *, Value *> &LoadMoveSet) { 1775 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 1776 PIE = PairableInsts.end(); PI != PIE; ++PI) { 1777 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); 1778 if (P == ChosenPairs.end()) continue; 1779 1780 Instruction *I = cast<Instruction>(P->first); 1781 collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I); 1782 } 1783 } 1784 1785 // When the first instruction in each pair is cloned, it will inherit its 1786 // parent's metadata. This metadata must be combined with that of the other 1787 // instruction in a safe way. 1788 void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) { 1789 SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata; 1790 K->getAllMetadataOtherThanDebugLoc(Metadata); 1791 for (unsigned i = 0, n = Metadata.size(); i < n; ++i) { 1792 unsigned Kind = Metadata[i].first; 1793 MDNode *JMD = J->getMetadata(Kind); 1794 MDNode *KMD = Metadata[i].second; 1795 1796 switch (Kind) { 1797 default: 1798 K->setMetadata(Kind, 0); // Remove unknown metadata 1799 break; 1800 case LLVMContext::MD_tbaa: 1801 K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD)); 1802 break; 1803 case LLVMContext::MD_fpmath: 1804 K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD)); 1805 break; 1806 } 1807 } 1808 } 1809 1810 // This function fuses the chosen instruction pairs into vector instructions, 1811 // taking care preserve any needed scalar outputs and, then, it reorders the 1812 // remaining instructions as needed (users of the first member of the pair 1813 // need to be moved to after the location of the second member of the pair 1814 // because the vector instruction is inserted in the location of the pair's 1815 // second member). 1816 void BBVectorize::fuseChosenPairs(BasicBlock &BB, 1817 std::vector<Value *> &PairableInsts, 1818 DenseMap<Value *, Value *> &ChosenPairs) { 1819 LLVMContext& Context = BB.getContext(); 1820 1821 // During the vectorization process, the order of the pairs to be fused 1822 // could be flipped. So we'll add each pair, flipped, into the ChosenPairs 1823 // list. After a pair is fused, the flipped pair is removed from the list. 1824 std::vector<ValuePair> FlippedPairs; 1825 FlippedPairs.reserve(ChosenPairs.size()); 1826 for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), 1827 E = ChosenPairs.end(); P != E; ++P) 1828 FlippedPairs.push_back(ValuePair(P->second, P->first)); 1829 for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(), 1830 E = FlippedPairs.end(); P != E; ++P) 1831 ChosenPairs.insert(*P); 1832 1833 std::multimap<Value *, Value *> LoadMoveSet; 1834 collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet); 1835 1836 DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); 1837 1838 for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { 1839 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI); 1840 if (P == ChosenPairs.end()) { 1841 ++PI; 1842 continue; 1843 } 1844 1845 if (getDepthFactor(P->first) == 0) { 1846 // These instructions are not really fused, but are tracked as though 1847 // they are. Any case in which it would be interesting to fuse them 1848 // will be taken care of by InstCombine. 1849 --NumFusedOps; 1850 ++PI; 1851 continue; 1852 } 1853 1854 Instruction *I = cast<Instruction>(P->first), 1855 *J = cast<Instruction>(P->second); 1856 1857 DEBUG(dbgs() << "BBV: fusing: " << *I << 1858 " <-> " << *J << "\n"); 1859 1860 // Remove the pair and flipped pair from the list. 1861 DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second); 1862 assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); 1863 ChosenPairs.erase(FP); 1864 ChosenPairs.erase(P); 1865 1866 if (!canMoveUsesOfIAfterJ(BB, LoadMoveSet, I, J)) { 1867 DEBUG(dbgs() << "BBV: fusion of: " << *I << 1868 " <-> " << *J << 1869 " aborted because of non-trivial dependency cycle\n"); 1870 --NumFusedOps; 1871 ++PI; 1872 continue; 1873 } 1874 1875 bool FlipMemInputs; 1876 unsigned NumOperands = I->getNumOperands(); 1877 SmallVector<Value *, 3> ReplacedOperands(NumOperands); 1878 getReplacementInputsForPair(Context, I, J, ReplacedOperands, 1879 FlipMemInputs); 1880 1881 // Make a copy of the original operation, change its type to the vector 1882 // type and replace its operands with the vector operands. 1883 Instruction *K = I->clone(); 1884 if (I->hasName()) K->takeName(I); 1885 1886 if (!isa<StoreInst>(K)) 1887 K->mutateType(getVecTypeForPair(I->getType())); 1888 1889 combineMetadata(K, J); 1890 1891 for (unsigned o = 0; o < NumOperands; ++o) 1892 K->setOperand(o, ReplacedOperands[o]); 1893 1894 // If we've flipped the memory inputs, make sure that we take the correct 1895 // alignment. 1896 if (FlipMemInputs) { 1897 if (isa<StoreInst>(K)) 1898 cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment()); 1899 else 1900 cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment()); 1901 } 1902 1903 K->insertAfter(J); 1904 1905 // Instruction insertion point: 1906 Instruction *InsertionPt = K; 1907 Instruction *K1 = 0, *K2 = 0; 1908 replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2, 1909 FlipMemInputs); 1910 1911 // The use tree of the first original instruction must be moved to after 1912 // the location of the second instruction. The entire use tree of the 1913 // first instruction is disjoint from the input tree of the second 1914 // (by definition), and so commutes with it. 1915 1916 moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J); 1917 1918 if (!isa<StoreInst>(I)) { 1919 I->replaceAllUsesWith(K1); 1920 J->replaceAllUsesWith(K2); 1921 AA->replaceWithNewValue(I, K1); 1922 AA->replaceWithNewValue(J, K2); 1923 } 1924 1925 // Instructions that may read from memory may be in the load move set. 1926 // Once an instruction is fused, we no longer need its move set, and so 1927 // the values of the map never need to be updated. However, when a load 1928 // is fused, we need to merge the entries from both instructions in the 1929 // pair in case those instructions were in the move set of some other 1930 // yet-to-be-fused pair. The loads in question are the keys of the map. 1931 if (I->mayReadFromMemory()) { 1932 std::vector<ValuePair> NewSetMembers; 1933 VPIteratorPair IPairRange = LoadMoveSet.equal_range(I); 1934 VPIteratorPair JPairRange = LoadMoveSet.equal_range(J); 1935 for (std::multimap<Value *, Value *>::iterator N = IPairRange.first; 1936 N != IPairRange.second; ++N) 1937 NewSetMembers.push_back(ValuePair(K, N->second)); 1938 for (std::multimap<Value *, Value *>::iterator N = JPairRange.first; 1939 N != JPairRange.second; ++N) 1940 NewSetMembers.push_back(ValuePair(K, N->second)); 1941 for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), 1942 AE = NewSetMembers.end(); A != AE; ++A) 1943 LoadMoveSet.insert(*A); 1944 } 1945 1946 // Before removing I, set the iterator to the next instruction. 1947 PI = llvm::next(BasicBlock::iterator(I)); 1948 if (cast<Instruction>(PI) == J) 1949 ++PI; 1950 1951 SE->forgetValue(I); 1952 SE->forgetValue(J); 1953 I->eraseFromParent(); 1954 J->eraseFromParent(); 1955 } 1956 1957 DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); 1958 } 1959} 1960 1961char BBVectorize::ID = 0; 1962static const char bb_vectorize_name[] = "Basic-Block Vectorization"; 1963INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 1964INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 1965INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 1966INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 1967 1968BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { 1969 return new BBVectorize(C); 1970} 1971 1972bool 1973llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { 1974 BBVectorize BBVectorizer(P, C); 1975 return BBVectorizer.vectorizeBB(BB); 1976} 1977 1978//===----------------------------------------------------------------------===// 1979VectorizeConfig::VectorizeConfig() { 1980 VectorBits = ::VectorBits; 1981 VectorizeBools = !::NoBools; 1982 VectorizeInts = !::NoInts; 1983 VectorizeFloats = !::NoFloats; 1984 VectorizePointers = !::NoPointers; 1985 VectorizeCasts = !::NoCasts; 1986 VectorizeMath = !::NoMath; 1987 VectorizeFMA = !::NoFMA; 1988 VectorizeSelect = !::NoSelect; 1989 VectorizeCmp = !::NoCmp; 1990 VectorizeGEP = !::NoGEP; 1991 VectorizeMemOps = !::NoMemOps; 1992 AlignedOnly = ::AlignedOnly; 1993 ReqChainDepth= ::ReqChainDepth; 1994 SearchLimit = ::SearchLimit; 1995 MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; 1996 SplatBreaksChain = ::SplatBreaksChain; 1997 MaxInsts = ::MaxInsts; 1998 MaxIter = ::MaxIter; 1999 NoMemOpBoost = ::NoMemOpBoost; 2000 FastDep = ::FastDep; 2001} 2002