BBVectorize.cpp revision 768edf3cd037aab10391abc279f71470df8e3156
1//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements a basic-block vectorization pass. The algorithm was 11// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, 12// et al. It works by looking for chains of pairable operations and then 13// pairing them. 14// 15//===----------------------------------------------------------------------===// 16 17#define BBV_NAME "bb-vectorize" 18#define DEBUG_TYPE BBV_NAME 19#include "llvm/Constants.h" 20#include "llvm/DerivedTypes.h" 21#include "llvm/Function.h" 22#include "llvm/Instructions.h" 23#include "llvm/IntrinsicInst.h" 24#include "llvm/Intrinsics.h" 25#include "llvm/LLVMContext.h" 26#include "llvm/Metadata.h" 27#include "llvm/Pass.h" 28#include "llvm/Type.h" 29#include "llvm/ADT/DenseMap.h" 30#include "llvm/ADT/DenseSet.h" 31#include "llvm/ADT/SmallVector.h" 32#include "llvm/ADT/Statistic.h" 33#include "llvm/ADT/STLExtras.h" 34#include "llvm/ADT/StringExtras.h" 35#include "llvm/Analysis/AliasAnalysis.h" 36#include "llvm/Analysis/AliasSetTracker.h" 37#include "llvm/Analysis/ScalarEvolution.h" 38#include "llvm/Analysis/ScalarEvolutionExpressions.h" 39#include "llvm/Analysis/ValueTracking.h" 40#include "llvm/Support/CommandLine.h" 41#include "llvm/Support/Debug.h" 42#include "llvm/Support/raw_ostream.h" 43#include "llvm/Support/ValueHandle.h" 44#include "llvm/Target/TargetData.h" 45#include "llvm/Transforms/Vectorize.h" 46#include <algorithm> 47#include <map> 48using namespace llvm; 49 50static cl::opt<unsigned> 51ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, 52 cl::desc("The required chain depth for vectorization")); 53 54static cl::opt<unsigned> 55SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, 56 cl::desc("The maximum search distance for instruction pairs")); 57 58static cl::opt<bool> 59SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, 60 cl::desc("Replicating one element to a pair breaks the chain")); 61 62static cl::opt<unsigned> 63VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, 64 cl::desc("The size of the native vector registers")); 65 66static cl::opt<unsigned> 67MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, 68 cl::desc("The maximum number of pairing iterations")); 69 70static cl::opt<unsigned> 71MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, 72 cl::desc("The maximum number of pairable instructions per group")); 73 74static cl::opt<unsigned> 75MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), 76 cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" 77 " a full cycle check")); 78 79static cl::opt<bool> 80NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, 81 cl::desc("Don't try to vectorize boolean (i1) values")); 82 83static cl::opt<bool> 84NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, 85 cl::desc("Don't try to vectorize integer values")); 86 87static cl::opt<bool> 88NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, 89 cl::desc("Don't try to vectorize floating-point values")); 90 91static cl::opt<bool> 92NoPointers("bb-vectorize-no-pointers", cl::init(false), cl::Hidden, 93 cl::desc("Don't try to vectorize pointer values")); 94 95static cl::opt<bool> 96NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, 97 cl::desc("Don't try to vectorize casting (conversion) operations")); 98 99static cl::opt<bool> 100NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, 101 cl::desc("Don't try to vectorize floating-point math intrinsics")); 102 103static cl::opt<bool> 104NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, 105 cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); 106 107static cl::opt<bool> 108NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, 109 cl::desc("Don't try to vectorize select instructions")); 110 111static cl::opt<bool> 112NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, 113 cl::desc("Don't try to vectorize comparison instructions")); 114 115static cl::opt<bool> 116NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, 117 cl::desc("Don't try to vectorize getelementptr instructions")); 118 119static cl::opt<bool> 120NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, 121 cl::desc("Don't try to vectorize loads and stores")); 122 123static cl::opt<bool> 124AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, 125 cl::desc("Only generate aligned loads and stores")); 126 127static cl::opt<bool> 128NoMemOpBoost("bb-vectorize-no-mem-op-boost", 129 cl::init(false), cl::Hidden, 130 cl::desc("Don't boost the chain-depth contribution of loads and stores")); 131 132static cl::opt<bool> 133FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, 134 cl::desc("Use a fast instruction dependency analysis")); 135 136#ifndef NDEBUG 137static cl::opt<bool> 138DebugInstructionExamination("bb-vectorize-debug-instruction-examination", 139 cl::init(false), cl::Hidden, 140 cl::desc("When debugging is enabled, output information on the" 141 " instruction-examination process")); 142static cl::opt<bool> 143DebugCandidateSelection("bb-vectorize-debug-candidate-selection", 144 cl::init(false), cl::Hidden, 145 cl::desc("When debugging is enabled, output information on the" 146 " candidate-selection process")); 147static cl::opt<bool> 148DebugPairSelection("bb-vectorize-debug-pair-selection", 149 cl::init(false), cl::Hidden, 150 cl::desc("When debugging is enabled, output information on the" 151 " pair-selection process")); 152static cl::opt<bool> 153DebugCycleCheck("bb-vectorize-debug-cycle-check", 154 cl::init(false), cl::Hidden, 155 cl::desc("When debugging is enabled, output information on the" 156 " cycle-checking process")); 157#endif 158 159STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); 160 161namespace { 162 struct BBVectorize : public BasicBlockPass { 163 static char ID; // Pass identification, replacement for typeid 164 165 const VectorizeConfig Config; 166 167 BBVectorize(const VectorizeConfig &C = VectorizeConfig()) 168 : BasicBlockPass(ID), Config(C) { 169 initializeBBVectorizePass(*PassRegistry::getPassRegistry()); 170 } 171 172 BBVectorize(Pass *P, const VectorizeConfig &C) 173 : BasicBlockPass(ID), Config(C) { 174 AA = &P->getAnalysis<AliasAnalysis>(); 175 SE = &P->getAnalysis<ScalarEvolution>(); 176 TD = P->getAnalysisIfAvailable<TargetData>(); 177 } 178 179 typedef std::pair<Value *, Value *> ValuePair; 180 typedef std::pair<ValuePair, size_t> ValuePairWithDepth; 181 typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair 182 typedef std::pair<std::multimap<Value *, Value *>::iterator, 183 std::multimap<Value *, Value *>::iterator> VPIteratorPair; 184 typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator, 185 std::multimap<ValuePair, ValuePair>::iterator> 186 VPPIteratorPair; 187 188 AliasAnalysis *AA; 189 ScalarEvolution *SE; 190 TargetData *TD; 191 192 // FIXME: const correct? 193 194 bool vectorizePairs(BasicBlock &BB); 195 196 bool getCandidatePairs(BasicBlock &BB, 197 BasicBlock::iterator &Start, 198 std::multimap<Value *, Value *> &CandidatePairs, 199 std::vector<Value *> &PairableInsts); 200 201 void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, 202 std::vector<Value *> &PairableInsts, 203 std::multimap<ValuePair, ValuePair> &ConnectedPairs); 204 205 void buildDepMap(BasicBlock &BB, 206 std::multimap<Value *, Value *> &CandidatePairs, 207 std::vector<Value *> &PairableInsts, 208 DenseSet<ValuePair> &PairableInstUsers); 209 210 void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, 211 std::vector<Value *> &PairableInsts, 212 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 213 DenseSet<ValuePair> &PairableInstUsers, 214 DenseMap<Value *, Value *>& ChosenPairs); 215 216 void fuseChosenPairs(BasicBlock &BB, 217 std::vector<Value *> &PairableInsts, 218 DenseMap<Value *, Value *>& ChosenPairs); 219 220 bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); 221 222 bool areInstsCompatible(Instruction *I, Instruction *J, 223 bool IsSimpleLoadStore); 224 225 bool trackUsesOfI(DenseSet<Value *> &Users, 226 AliasSetTracker &WriteSet, Instruction *I, 227 Instruction *J, bool UpdateUsers = true, 228 std::multimap<Value *, Value *> *LoadMoveSet = 0); 229 230 void computePairsConnectedTo( 231 std::multimap<Value *, Value *> &CandidatePairs, 232 std::vector<Value *> &PairableInsts, 233 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 234 ValuePair P); 235 236 bool pairsConflict(ValuePair P, ValuePair Q, 237 DenseSet<ValuePair> &PairableInstUsers, 238 std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0); 239 240 bool pairWillFormCycle(ValuePair P, 241 std::multimap<ValuePair, ValuePair> &PairableInstUsers, 242 DenseSet<ValuePair> &CurrentPairs); 243 244 void pruneTreeFor( 245 std::multimap<Value *, Value *> &CandidatePairs, 246 std::vector<Value *> &PairableInsts, 247 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 248 DenseSet<ValuePair> &PairableInstUsers, 249 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 250 DenseMap<Value *, Value *> &ChosenPairs, 251 DenseMap<ValuePair, size_t> &Tree, 252 DenseSet<ValuePair> &PrunedTree, ValuePair J, 253 bool UseCycleCheck); 254 255 void buildInitialTreeFor( 256 std::multimap<Value *, Value *> &CandidatePairs, 257 std::vector<Value *> &PairableInsts, 258 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 259 DenseSet<ValuePair> &PairableInstUsers, 260 DenseMap<Value *, Value *> &ChosenPairs, 261 DenseMap<ValuePair, size_t> &Tree, ValuePair J); 262 263 void findBestTreeFor( 264 std::multimap<Value *, Value *> &CandidatePairs, 265 std::vector<Value *> &PairableInsts, 266 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 267 DenseSet<ValuePair> &PairableInstUsers, 268 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 269 DenseMap<Value *, Value *> &ChosenPairs, 270 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, 271 size_t &BestEffSize, VPIteratorPair ChoiceRange, 272 bool UseCycleCheck); 273 274 Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, 275 Instruction *J, unsigned o, bool &FlipMemInputs); 276 277 void fillNewShuffleMask(LLVMContext& Context, Instruction *J, 278 unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, 279 unsigned IdxOffset, std::vector<Constant*> &Mask); 280 281 Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, 282 Instruction *J); 283 284 Value *getReplacementInput(LLVMContext& Context, Instruction *I, 285 Instruction *J, unsigned o, bool FlipMemInputs); 286 287 void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, 288 Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, 289 bool &FlipMemInputs); 290 291 void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 292 Instruction *J, Instruction *K, 293 Instruction *&InsertionPt, Instruction *&K1, 294 Instruction *&K2, bool &FlipMemInputs); 295 296 void collectPairLoadMoveSet(BasicBlock &BB, 297 DenseMap<Value *, Value *> &ChosenPairs, 298 std::multimap<Value *, Value *> &LoadMoveSet, 299 Instruction *I); 300 301 void collectLoadMoveSet(BasicBlock &BB, 302 std::vector<Value *> &PairableInsts, 303 DenseMap<Value *, Value *> &ChosenPairs, 304 std::multimap<Value *, Value *> &LoadMoveSet); 305 306 bool canMoveUsesOfIAfterJ(BasicBlock &BB, 307 std::multimap<Value *, Value *> &LoadMoveSet, 308 Instruction *I, Instruction *J); 309 310 void moveUsesOfIAfterJ(BasicBlock &BB, 311 std::multimap<Value *, Value *> &LoadMoveSet, 312 Instruction *&InsertionPt, 313 Instruction *I, Instruction *J); 314 315 void combineMetadata(Instruction *K, const Instruction *J); 316 317 bool vectorizeBB(BasicBlock &BB) { 318 bool changed = false; 319 // Iterate a sufficient number of times to merge types of size 1 bit, 320 // then 2 bits, then 4, etc. up to half of the target vector width of the 321 // target vector register. 322 for (unsigned v = 2, n = 1; 323 v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter); 324 v *= 2, ++n) { 325 DEBUG(dbgs() << "BBV: fusing loop #" << n << 326 " for " << BB.getName() << " in " << 327 BB.getParent()->getName() << "...\n"); 328 if (vectorizePairs(BB)) 329 changed = true; 330 else 331 break; 332 } 333 334 DEBUG(dbgs() << "BBV: done!\n"); 335 return changed; 336 } 337 338 virtual bool runOnBasicBlock(BasicBlock &BB) { 339 AA = &getAnalysis<AliasAnalysis>(); 340 SE = &getAnalysis<ScalarEvolution>(); 341 TD = getAnalysisIfAvailable<TargetData>(); 342 343 return vectorizeBB(BB); 344 } 345 346 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 347 BasicBlockPass::getAnalysisUsage(AU); 348 AU.addRequired<AliasAnalysis>(); 349 AU.addRequired<ScalarEvolution>(); 350 AU.addPreserved<AliasAnalysis>(); 351 AU.addPreserved<ScalarEvolution>(); 352 AU.setPreservesCFG(); 353 } 354 355 // This returns the vector type that holds a pair of the provided type. 356 // If the provided type is already a vector, then its length is doubled. 357 static inline VectorType *getVecTypeForPair(Type *ElemTy) { 358 if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { 359 unsigned numElem = VTy->getNumElements(); 360 return VectorType::get(ElemTy->getScalarType(), numElem*2); 361 } 362 363 return VectorType::get(ElemTy, 2); 364 } 365 366 // Returns the weight associated with the provided value. A chain of 367 // candidate pairs has a length given by the sum of the weights of its 368 // members (one weight per pair; the weight of each member of the pair 369 // is assumed to be the same). This length is then compared to the 370 // chain-length threshold to determine if a given chain is significant 371 // enough to be vectorized. The length is also used in comparing 372 // candidate chains where longer chains are considered to be better. 373 // Note: when this function returns 0, the resulting instructions are 374 // not actually fused. 375 inline size_t getDepthFactor(Value *V) { 376 // InsertElement and ExtractElement have a depth factor of zero. This is 377 // for two reasons: First, they cannot be usefully fused. Second, because 378 // the pass generates a lot of these, they can confuse the simple metric 379 // used to compare the trees in the next iteration. Thus, giving them a 380 // weight of zero allows the pass to essentially ignore them in 381 // subsequent iterations when looking for vectorization opportunities 382 // while still tracking dependency chains that flow through those 383 // instructions. 384 if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V)) 385 return 0; 386 387 // Give a load or store half of the required depth so that load/store 388 // pairs will vectorize. 389 if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V))) 390 return Config.ReqChainDepth/2; 391 392 return 1; 393 } 394 395 // This determines the relative offset of two loads or stores, returning 396 // true if the offset could be determined to be some constant value. 397 // For example, if OffsetInElmts == 1, then J accesses the memory directly 398 // after I; if OffsetInElmts == -1 then I accesses the memory 399 // directly after J. This function assumes that both instructions 400 // have the same type. 401 bool getPairPtrInfo(Instruction *I, Instruction *J, 402 Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, 403 int64_t &OffsetInElmts) { 404 OffsetInElmts = 0; 405 if (isa<LoadInst>(I)) { 406 IPtr = cast<LoadInst>(I)->getPointerOperand(); 407 JPtr = cast<LoadInst>(J)->getPointerOperand(); 408 IAlignment = cast<LoadInst>(I)->getAlignment(); 409 JAlignment = cast<LoadInst>(J)->getAlignment(); 410 } else { 411 IPtr = cast<StoreInst>(I)->getPointerOperand(); 412 JPtr = cast<StoreInst>(J)->getPointerOperand(); 413 IAlignment = cast<StoreInst>(I)->getAlignment(); 414 JAlignment = cast<StoreInst>(J)->getAlignment(); 415 } 416 417 const SCEV *IPtrSCEV = SE->getSCEV(IPtr); 418 const SCEV *JPtrSCEV = SE->getSCEV(JPtr); 419 420 // If this is a trivial offset, then we'll get something like 421 // 1*sizeof(type). With target data, which we need anyway, this will get 422 // constant folded into a number. 423 const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); 424 if (const SCEVConstant *ConstOffSCEV = 425 dyn_cast<SCEVConstant>(OffsetSCEV)) { 426 ConstantInt *IntOff = ConstOffSCEV->getValue(); 427 int64_t Offset = IntOff->getSExtValue(); 428 429 Type *VTy = cast<PointerType>(IPtr->getType())->getElementType(); 430 int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); 431 432 assert(VTy == cast<PointerType>(JPtr->getType())->getElementType()); 433 434 OffsetInElmts = Offset/VTyTSS; 435 return (abs64(Offset) % VTyTSS) == 0; 436 } 437 438 return false; 439 } 440 441 // Returns true if the provided CallInst represents an intrinsic that can 442 // be vectorized. 443 bool isVectorizableIntrinsic(CallInst* I) { 444 Function *F = I->getCalledFunction(); 445 if (!F) return false; 446 447 unsigned IID = F->getIntrinsicID(); 448 if (!IID) return false; 449 450 switch(IID) { 451 default: 452 return false; 453 case Intrinsic::sqrt: 454 case Intrinsic::powi: 455 case Intrinsic::sin: 456 case Intrinsic::cos: 457 case Intrinsic::log: 458 case Intrinsic::log2: 459 case Intrinsic::log10: 460 case Intrinsic::exp: 461 case Intrinsic::exp2: 462 case Intrinsic::pow: 463 return Config.VectorizeMath; 464 case Intrinsic::fma: 465 return Config.VectorizeFMA; 466 } 467 } 468 469 // Returns true if J is the second element in some pair referenced by 470 // some multimap pair iterator pair. 471 template <typename V> 472 bool isSecondInIteratorPair(V J, std::pair< 473 typename std::multimap<V, V>::iterator, 474 typename std::multimap<V, V>::iterator> PairRange) { 475 for (typename std::multimap<V, V>::iterator K = PairRange.first; 476 K != PairRange.second; ++K) 477 if (K->second == J) return true; 478 479 return false; 480 } 481 }; 482 483 // This function implements one vectorization iteration on the provided 484 // basic block. It returns true if the block is changed. 485 bool BBVectorize::vectorizePairs(BasicBlock &BB) { 486 bool ShouldContinue; 487 BasicBlock::iterator Start = BB.getFirstInsertionPt(); 488 489 std::vector<Value *> AllPairableInsts; 490 DenseMap<Value *, Value *> AllChosenPairs; 491 492 do { 493 std::vector<Value *> PairableInsts; 494 std::multimap<Value *, Value *> CandidatePairs; 495 ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, 496 PairableInsts); 497 if (PairableInsts.empty()) continue; 498 499 // Now we have a map of all of the pairable instructions and we need to 500 // select the best possible pairing. A good pairing is one such that the 501 // users of the pair are also paired. This defines a (directed) forest 502 // over the pairs such that two pairs are connected iff the second pair 503 // uses the first. 504 505 // Note that it only matters that both members of the second pair use some 506 // element of the first pair (to allow for splatting). 507 508 std::multimap<ValuePair, ValuePair> ConnectedPairs; 509 computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs); 510 if (ConnectedPairs.empty()) continue; 511 512 // Build the pairable-instruction dependency map 513 DenseSet<ValuePair> PairableInstUsers; 514 buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); 515 516 // There is now a graph of the connected pairs. For each variable, pick 517 // the pairing with the largest tree meeting the depth requirement on at 518 // least one branch. Then select all pairings that are part of that tree 519 // and remove them from the list of available pairings and pairable 520 // variables. 521 522 DenseMap<Value *, Value *> ChosenPairs; 523 choosePairs(CandidatePairs, PairableInsts, ConnectedPairs, 524 PairableInstUsers, ChosenPairs); 525 526 if (ChosenPairs.empty()) continue; 527 AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), 528 PairableInsts.end()); 529 AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); 530 } while (ShouldContinue); 531 532 if (AllChosenPairs.empty()) return false; 533 NumFusedOps += AllChosenPairs.size(); 534 535 // A set of pairs has now been selected. It is now necessary to replace the 536 // paired instructions with vector instructions. For this procedure each 537 // operand must be replaced with a vector operand. This vector is formed 538 // by using build_vector on the old operands. The replaced values are then 539 // replaced with a vector_extract on the result. Subsequent optimization 540 // passes should coalesce the build/extract combinations. 541 542 fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs); 543 return true; 544 } 545 546 // This function returns true if the provided instruction is capable of being 547 // fused into a vector instruction. This determination is based only on the 548 // type and other attributes of the instruction. 549 bool BBVectorize::isInstVectorizable(Instruction *I, 550 bool &IsSimpleLoadStore) { 551 IsSimpleLoadStore = false; 552 553 if (CallInst *C = dyn_cast<CallInst>(I)) { 554 if (!isVectorizableIntrinsic(C)) 555 return false; 556 } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { 557 // Vectorize simple loads if possbile: 558 IsSimpleLoadStore = L->isSimple(); 559 if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 560 return false; 561 } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { 562 // Vectorize simple stores if possbile: 563 IsSimpleLoadStore = S->isSimple(); 564 if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 565 return false; 566 } else if (CastInst *C = dyn_cast<CastInst>(I)) { 567 // We can vectorize casts, but not casts of pointer types, etc. 568 if (!Config.VectorizeCasts) 569 return false; 570 571 Type *SrcTy = C->getSrcTy(); 572 if (!SrcTy->isSingleValueType()) 573 return false; 574 575 Type *DestTy = C->getDestTy(); 576 if (!DestTy->isSingleValueType()) 577 return false; 578 } else if (isa<SelectInst>(I)) { 579 if (!Config.VectorizeSelect) 580 return false; 581 } else if (isa<CmpInst>(I)) { 582 if (!Config.VectorizeCmp) 583 return false; 584 } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { 585 if (!Config.VectorizeGEP) 586 return false; 587 588 // Currently, vector GEPs exist only with one index. 589 if (G->getNumIndices() != 1) 590 return false; 591 } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || 592 isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { 593 return false; 594 } 595 596 // We can't vectorize memory operations without target data 597 if (TD == 0 && IsSimpleLoadStore) 598 return false; 599 600 Type *T1, *T2; 601 if (isa<StoreInst>(I)) { 602 // For stores, it is the value type, not the pointer type that matters 603 // because the value is what will come from a vector register. 604 605 Value *IVal = cast<StoreInst>(I)->getValueOperand(); 606 T1 = IVal->getType(); 607 } else { 608 T1 = I->getType(); 609 } 610 611 if (I->isCast()) 612 T2 = cast<CastInst>(I)->getSrcTy(); 613 else 614 T2 = T1; 615 616 // Not every type can be vectorized... 617 if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || 618 !(VectorType::isValidElementType(T2) || T2->isVectorTy())) 619 return false; 620 621 if (T1->getScalarSizeInBits() == 1 && T2->getScalarSizeInBits() == 1) { 622 if (!Config.VectorizeBools) 623 return false; 624 } else { 625 if (!Config.VectorizeInts 626 && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) 627 return false; 628 } 629 630 if (!Config.VectorizeFloats 631 && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) 632 return false; 633 634 // Don't vectorize target-specific types. 635 if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy()) 636 return false; 637 if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) 638 return false; 639 640 if ((!Config.VectorizePointers || TD == 0) && 641 (T1->getScalarType()->isPointerTy() || 642 T2->getScalarType()->isPointerTy())) 643 return false; 644 645 if (T1->getPrimitiveSizeInBits() > Config.VectorBits/2 || 646 T2->getPrimitiveSizeInBits() > Config.VectorBits/2) 647 return false; 648 649 return true; 650 } 651 652 // This function returns true if the two provided instructions are compatible 653 // (meaning that they can be fused into a vector instruction). This assumes 654 // that I has already been determined to be vectorizable and that J is not 655 // in the use tree of I. 656 bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, 657 bool IsSimpleLoadStore) { 658 DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << 659 " <-> " << *J << "\n"); 660 661 // Loads and stores can be merged if they have different alignments, 662 // but are otherwise the same. 663 LoadInst *LI, *LJ; 664 StoreInst *SI, *SJ; 665 if ((LI = dyn_cast<LoadInst>(I)) && (LJ = dyn_cast<LoadInst>(J))) { 666 if (I->getType() != J->getType()) 667 return false; 668 669 if (LI->getPointerOperand()->getType() != 670 LJ->getPointerOperand()->getType() || 671 LI->isVolatile() != LJ->isVolatile() || 672 LI->getOrdering() != LJ->getOrdering() || 673 LI->getSynchScope() != LJ->getSynchScope()) 674 return false; 675 } else if ((SI = dyn_cast<StoreInst>(I)) && (SJ = dyn_cast<StoreInst>(J))) { 676 if (SI->getValueOperand()->getType() != 677 SJ->getValueOperand()->getType() || 678 SI->getPointerOperand()->getType() != 679 SJ->getPointerOperand()->getType() || 680 SI->isVolatile() != SJ->isVolatile() || 681 SI->getOrdering() != SJ->getOrdering() || 682 SI->getSynchScope() != SJ->getSynchScope()) 683 return false; 684 } else if (!J->isSameOperationAs(I)) { 685 return false; 686 } 687 // FIXME: handle addsub-type operations! 688 689 if (IsSimpleLoadStore) { 690 Value *IPtr, *JPtr; 691 unsigned IAlignment, JAlignment; 692 int64_t OffsetInElmts = 0; 693 if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 694 OffsetInElmts) && abs64(OffsetInElmts) == 1) { 695 if (Config.AlignedOnly) { 696 Type *aType = isa<StoreInst>(I) ? 697 cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); 698 // An aligned load or store is possible only if the instruction 699 // with the lower offset has an alignment suitable for the 700 // vector type. 701 702 unsigned BottomAlignment = IAlignment; 703 if (OffsetInElmts < 0) BottomAlignment = JAlignment; 704 705 Type *VType = getVecTypeForPair(aType); 706 unsigned VecAlignment = TD->getPrefTypeAlignment(VType); 707 if (BottomAlignment < VecAlignment) 708 return false; 709 } 710 } else { 711 return false; 712 } 713 } else if (isa<ShuffleVectorInst>(I)) { 714 // Only merge two shuffles if they're both constant 715 return isa<Constant>(I->getOperand(2)) && 716 isa<Constant>(J->getOperand(2)); 717 // FIXME: We may want to vectorize non-constant shuffles also. 718 } 719 720 // The powi intrinsic is special because only the first argument is 721 // vectorized, the second arguments must be equal. 722 CallInst *CI = dyn_cast<CallInst>(I); 723 Function *FI; 724 if (CI && (FI = CI->getCalledFunction()) && 725 FI->getIntrinsicID() == Intrinsic::powi) { 726 727 Value *A1I = CI->getArgOperand(1), 728 *A1J = cast<CallInst>(J)->getArgOperand(1); 729 const SCEV *A1ISCEV = SE->getSCEV(A1I), 730 *A1JSCEV = SE->getSCEV(A1J); 731 return (A1ISCEV == A1JSCEV); 732 } 733 734 return true; 735 } 736 737 // Figure out whether or not J uses I and update the users and write-set 738 // structures associated with I. Specifically, Users represents the set of 739 // instructions that depend on I. WriteSet represents the set 740 // of memory locations that are dependent on I. If UpdateUsers is true, 741 // and J uses I, then Users is updated to contain J and WriteSet is updated 742 // to contain any memory locations to which J writes. The function returns 743 // true if J uses I. By default, alias analysis is used to determine 744 // whether J reads from memory that overlaps with a location in WriteSet. 745 // If LoadMoveSet is not null, then it is a previously-computed multimap 746 // where the key is the memory-based user instruction and the value is 747 // the instruction to be compared with I. So, if LoadMoveSet is provided, 748 // then the alias analysis is not used. This is necessary because this 749 // function is called during the process of moving instructions during 750 // vectorization and the results of the alias analysis are not stable during 751 // that process. 752 bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, 753 AliasSetTracker &WriteSet, Instruction *I, 754 Instruction *J, bool UpdateUsers, 755 std::multimap<Value *, Value *> *LoadMoveSet) { 756 bool UsesI = false; 757 758 // This instruction may already be marked as a user due, for example, to 759 // being a member of a selected pair. 760 if (Users.count(J)) 761 UsesI = true; 762 763 if (!UsesI) 764 for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); 765 JU != JE; ++JU) { 766 Value *V = *JU; 767 if (I == V || Users.count(V)) { 768 UsesI = true; 769 break; 770 } 771 } 772 if (!UsesI && J->mayReadFromMemory()) { 773 if (LoadMoveSet) { 774 VPIteratorPair JPairRange = LoadMoveSet->equal_range(J); 775 UsesI = isSecondInIteratorPair<Value*>(I, JPairRange); 776 } else { 777 for (AliasSetTracker::iterator W = WriteSet.begin(), 778 WE = WriteSet.end(); W != WE; ++W) { 779 if (W->aliasesUnknownInst(J, *AA)) { 780 UsesI = true; 781 break; 782 } 783 } 784 } 785 } 786 787 if (UsesI && UpdateUsers) { 788 if (J->mayWriteToMemory()) WriteSet.add(J); 789 Users.insert(J); 790 } 791 792 return UsesI; 793 } 794 795 // This function iterates over all instruction pairs in the provided 796 // basic block and collects all candidate pairs for vectorization. 797 bool BBVectorize::getCandidatePairs(BasicBlock &BB, 798 BasicBlock::iterator &Start, 799 std::multimap<Value *, Value *> &CandidatePairs, 800 std::vector<Value *> &PairableInsts) { 801 BasicBlock::iterator E = BB.end(); 802 if (Start == E) return false; 803 804 bool ShouldContinue = false, IAfterStart = false; 805 for (BasicBlock::iterator I = Start++; I != E; ++I) { 806 if (I == Start) IAfterStart = true; 807 808 bool IsSimpleLoadStore; 809 if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; 810 811 // Look for an instruction with which to pair instruction *I... 812 DenseSet<Value *> Users; 813 AliasSetTracker WriteSet(*AA); 814 bool JAfterStart = IAfterStart; 815 BasicBlock::iterator J = llvm::next(I); 816 for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { 817 if (J == Start) JAfterStart = true; 818 819 // Determine if J uses I, if so, exit the loop. 820 bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep); 821 if (Config.FastDep) { 822 // Note: For this heuristic to be effective, independent operations 823 // must tend to be intermixed. This is likely to be true from some 824 // kinds of grouped loop unrolling (but not the generic LLVM pass), 825 // but otherwise may require some kind of reordering pass. 826 827 // When using fast dependency analysis, 828 // stop searching after first use: 829 if (UsesI) break; 830 } else { 831 if (UsesI) continue; 832 } 833 834 // J does not use I, and comes before the first use of I, so it can be 835 // merged with I if the instructions are compatible. 836 if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue; 837 838 // J is a candidate for merging with I. 839 if (!PairableInsts.size() || 840 PairableInsts[PairableInsts.size()-1] != I) { 841 PairableInsts.push_back(I); 842 } 843 844 CandidatePairs.insert(ValuePair(I, J)); 845 846 // The next call to this function must start after the last instruction 847 // selected during this invocation. 848 if (JAfterStart) { 849 Start = llvm::next(J); 850 IAfterStart = JAfterStart = false; 851 } 852 853 DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " 854 << *I << " <-> " << *J << "\n"); 855 856 // If we have already found too many pairs, break here and this function 857 // will be called again starting after the last instruction selected 858 // during this invocation. 859 if (PairableInsts.size() >= Config.MaxInsts) { 860 ShouldContinue = true; 861 break; 862 } 863 } 864 865 if (ShouldContinue) 866 break; 867 } 868 869 DEBUG(dbgs() << "BBV: found " << PairableInsts.size() 870 << " instructions with candidate pairs\n"); 871 872 return ShouldContinue; 873 } 874 875 // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that 876 // it looks for pairs such that both members have an input which is an 877 // output of PI or PJ. 878 void BBVectorize::computePairsConnectedTo( 879 std::multimap<Value *, Value *> &CandidatePairs, 880 std::vector<Value *> &PairableInsts, 881 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 882 ValuePair P) { 883 StoreInst *SI, *SJ; 884 885 // For each possible pairing for this variable, look at the uses of 886 // the first value... 887 for (Value::use_iterator I = P.first->use_begin(), 888 E = P.first->use_end(); I != E; ++I) { 889 if (isa<LoadInst>(*I)) { 890 // A pair cannot be connected to a load because the load only takes one 891 // operand (the address) and it is a scalar even after vectorization. 892 continue; 893 } else if ((SI = dyn_cast<StoreInst>(*I)) && 894 P.first == SI->getPointerOperand()) { 895 // Similarly, a pair cannot be connected to a store through its 896 // pointer operand. 897 continue; 898 } 899 900 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); 901 902 // For each use of the first variable, look for uses of the second 903 // variable... 904 for (Value::use_iterator J = P.second->use_begin(), 905 E2 = P.second->use_end(); J != E2; ++J) { 906 if ((SJ = dyn_cast<StoreInst>(*J)) && 907 P.second == SJ->getPointerOperand()) 908 continue; 909 910 VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); 911 912 // Look for <I, J>: 913 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) 914 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); 915 916 // Look for <J, I>: 917 if (isSecondInIteratorPair<Value*>(*I, JPairRange)) 918 ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I))); 919 } 920 921 if (Config.SplatBreaksChain) continue; 922 // Look for cases where just the first value in the pair is used by 923 // both members of another pair (splatting). 924 for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) { 925 if ((SJ = dyn_cast<StoreInst>(*J)) && 926 P.first == SJ->getPointerOperand()) 927 continue; 928 929 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) 930 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); 931 } 932 } 933 934 if (Config.SplatBreaksChain) return; 935 // Look for cases where just the second value in the pair is used by 936 // both members of another pair (splatting). 937 for (Value::use_iterator I = P.second->use_begin(), 938 E = P.second->use_end(); I != E; ++I) { 939 if (isa<LoadInst>(*I)) 940 continue; 941 else if ((SI = dyn_cast<StoreInst>(*I)) && 942 P.second == SI->getPointerOperand()) 943 continue; 944 945 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); 946 947 for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { 948 if ((SJ = dyn_cast<StoreInst>(*J)) && 949 P.second == SJ->getPointerOperand()) 950 continue; 951 952 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) 953 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); 954 } 955 } 956 } 957 958 // This function figures out which pairs are connected. Two pairs are 959 // connected if some output of the first pair forms an input to both members 960 // of the second pair. 961 void BBVectorize::computeConnectedPairs( 962 std::multimap<Value *, Value *> &CandidatePairs, 963 std::vector<Value *> &PairableInsts, 964 std::multimap<ValuePair, ValuePair> &ConnectedPairs) { 965 966 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 967 PE = PairableInsts.end(); PI != PE; ++PI) { 968 VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI); 969 970 for (std::multimap<Value *, Value *>::iterator P = choiceRange.first; 971 P != choiceRange.second; ++P) 972 computePairsConnectedTo(CandidatePairs, PairableInsts, 973 ConnectedPairs, *P); 974 } 975 976 DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size() 977 << " pair connections.\n"); 978 } 979 980 // This function builds a set of use tuples such that <A, B> is in the set 981 // if B is in the use tree of A. If B is in the use tree of A, then B 982 // depends on the output of A. 983 void BBVectorize::buildDepMap( 984 BasicBlock &BB, 985 std::multimap<Value *, Value *> &CandidatePairs, 986 std::vector<Value *> &PairableInsts, 987 DenseSet<ValuePair> &PairableInstUsers) { 988 DenseSet<Value *> IsInPair; 989 for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(), 990 E = CandidatePairs.end(); C != E; ++C) { 991 IsInPair.insert(C->first); 992 IsInPair.insert(C->second); 993 } 994 995 // Iterate through the basic block, recording all Users of each 996 // pairable instruction. 997 998 BasicBlock::iterator E = BB.end(); 999 for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { 1000 if (IsInPair.find(I) == IsInPair.end()) continue; 1001 1002 DenseSet<Value *> Users; 1003 AliasSetTracker WriteSet(*AA); 1004 for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) 1005 (void) trackUsesOfI(Users, WriteSet, I, J); 1006 1007 for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); 1008 U != E; ++U) 1009 PairableInstUsers.insert(ValuePair(I, *U)); 1010 } 1011 } 1012 1013 // Returns true if an input to pair P is an output of pair Q and also an 1014 // input of pair Q is an output of pair P. If this is the case, then these 1015 // two pairs cannot be simultaneously fused. 1016 bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, 1017 DenseSet<ValuePair> &PairableInstUsers, 1018 std::multimap<ValuePair, ValuePair> *PairableInstUserMap) { 1019 // Two pairs are in conflict if they are mutual Users of eachother. 1020 bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || 1021 PairableInstUsers.count(ValuePair(P.first, Q.second)) || 1022 PairableInstUsers.count(ValuePair(P.second, Q.first)) || 1023 PairableInstUsers.count(ValuePair(P.second, Q.second)); 1024 bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || 1025 PairableInstUsers.count(ValuePair(Q.first, P.second)) || 1026 PairableInstUsers.count(ValuePair(Q.second, P.first)) || 1027 PairableInstUsers.count(ValuePair(Q.second, P.second)); 1028 if (PairableInstUserMap) { 1029 // FIXME: The expensive part of the cycle check is not so much the cycle 1030 // check itself but this edge insertion procedure. This needs some 1031 // profiling and probably a different data structure (same is true of 1032 // most uses of std::multimap). 1033 if (PUsesQ) { 1034 VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q); 1035 if (!isSecondInIteratorPair(P, QPairRange)) 1036 PairableInstUserMap->insert(VPPair(Q, P)); 1037 } 1038 if (QUsesP) { 1039 VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P); 1040 if (!isSecondInIteratorPair(Q, PPairRange)) 1041 PairableInstUserMap->insert(VPPair(P, Q)); 1042 } 1043 } 1044 1045 return (QUsesP && PUsesQ); 1046 } 1047 1048 // This function walks the use graph of current pairs to see if, starting 1049 // from P, the walk returns to P. 1050 bool BBVectorize::pairWillFormCycle(ValuePair P, 1051 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 1052 DenseSet<ValuePair> &CurrentPairs) { 1053 DEBUG(if (DebugCycleCheck) 1054 dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " 1055 << *P.second << "\n"); 1056 // A lookup table of visisted pairs is kept because the PairableInstUserMap 1057 // contains non-direct associations. 1058 DenseSet<ValuePair> Visited; 1059 SmallVector<ValuePair, 32> Q; 1060 // General depth-first post-order traversal: 1061 Q.push_back(P); 1062 do { 1063 ValuePair QTop = Q.pop_back_val(); 1064 Visited.insert(QTop); 1065 1066 DEBUG(if (DebugCycleCheck) 1067 dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " 1068 << *QTop.second << "\n"); 1069 VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop); 1070 for (std::multimap<ValuePair, ValuePair>::iterator C = QPairRange.first; 1071 C != QPairRange.second; ++C) { 1072 if (C->second == P) { 1073 DEBUG(dbgs() 1074 << "BBV: rejected to prevent non-trivial cycle formation: " 1075 << *C->first.first << " <-> " << *C->first.second << "\n"); 1076 return true; 1077 } 1078 1079 if (CurrentPairs.count(C->second) && !Visited.count(C->second)) 1080 Q.push_back(C->second); 1081 } 1082 } while (!Q.empty()); 1083 1084 return false; 1085 } 1086 1087 // This function builds the initial tree of connected pairs with the 1088 // pair J at the root. 1089 void BBVectorize::buildInitialTreeFor( 1090 std::multimap<Value *, Value *> &CandidatePairs, 1091 std::vector<Value *> &PairableInsts, 1092 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1093 DenseSet<ValuePair> &PairableInstUsers, 1094 DenseMap<Value *, Value *> &ChosenPairs, 1095 DenseMap<ValuePair, size_t> &Tree, ValuePair J) { 1096 // Each of these pairs is viewed as the root node of a Tree. The Tree 1097 // is then walked (depth-first). As this happens, we keep track of 1098 // the pairs that compose the Tree and the maximum depth of the Tree. 1099 SmallVector<ValuePairWithDepth, 32> Q; 1100 // General depth-first post-order traversal: 1101 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1102 do { 1103 ValuePairWithDepth QTop = Q.back(); 1104 1105 // Push each child onto the queue: 1106 bool MoreChildren = false; 1107 size_t MaxChildDepth = QTop.second; 1108 VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first); 1109 for (std::multimap<ValuePair, ValuePair>::iterator k = qtRange.first; 1110 k != qtRange.second; ++k) { 1111 // Make sure that this child pair is still a candidate: 1112 bool IsStillCand = false; 1113 VPIteratorPair checkRange = 1114 CandidatePairs.equal_range(k->second.first); 1115 for (std::multimap<Value *, Value *>::iterator m = checkRange.first; 1116 m != checkRange.second; ++m) { 1117 if (m->second == k->second.second) { 1118 IsStillCand = true; 1119 break; 1120 } 1121 } 1122 1123 if (IsStillCand) { 1124 DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second); 1125 if (C == Tree.end()) { 1126 size_t d = getDepthFactor(k->second.first); 1127 Q.push_back(ValuePairWithDepth(k->second, QTop.second+d)); 1128 MoreChildren = true; 1129 } else { 1130 MaxChildDepth = std::max(MaxChildDepth, C->second); 1131 } 1132 } 1133 } 1134 1135 if (!MoreChildren) { 1136 // Record the current pair as part of the Tree: 1137 Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); 1138 Q.pop_back(); 1139 } 1140 } while (!Q.empty()); 1141 } 1142 1143 // Given some initial tree, prune it by removing conflicting pairs (pairs 1144 // that cannot be simultaneously chosen for vectorization). 1145 void BBVectorize::pruneTreeFor( 1146 std::multimap<Value *, Value *> &CandidatePairs, 1147 std::vector<Value *> &PairableInsts, 1148 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1149 DenseSet<ValuePair> &PairableInstUsers, 1150 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 1151 DenseMap<Value *, Value *> &ChosenPairs, 1152 DenseMap<ValuePair, size_t> &Tree, 1153 DenseSet<ValuePair> &PrunedTree, ValuePair J, 1154 bool UseCycleCheck) { 1155 SmallVector<ValuePairWithDepth, 32> Q; 1156 // General depth-first post-order traversal: 1157 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1158 do { 1159 ValuePairWithDepth QTop = Q.pop_back_val(); 1160 PrunedTree.insert(QTop.first); 1161 1162 // Visit each child, pruning as necessary... 1163 DenseMap<ValuePair, size_t> BestChildren; 1164 VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first); 1165 for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first; 1166 K != QTopRange.second; ++K) { 1167 DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second); 1168 if (C == Tree.end()) continue; 1169 1170 // This child is in the Tree, now we need to make sure it is the 1171 // best of any conflicting children. There could be multiple 1172 // conflicting children, so first, determine if we're keeping 1173 // this child, then delete conflicting children as necessary. 1174 1175 // It is also necessary to guard against pairing-induced 1176 // dependencies. Consider instructions a .. x .. y .. b 1177 // such that (a,b) are to be fused and (x,y) are to be fused 1178 // but a is an input to x and b is an output from y. This 1179 // means that y cannot be moved after b but x must be moved 1180 // after b for (a,b) to be fused. In other words, after 1181 // fusing (a,b) we have y .. a/b .. x where y is an input 1182 // to a/b and x is an output to a/b: x and y can no longer 1183 // be legally fused. To prevent this condition, we must 1184 // make sure that a child pair added to the Tree is not 1185 // both an input and output of an already-selected pair. 1186 1187 // Pairing-induced dependencies can also form from more complicated 1188 // cycles. The pair vs. pair conflicts are easy to check, and so 1189 // that is done explicitly for "fast rejection", and because for 1190 // child vs. child conflicts, we may prefer to keep the current 1191 // pair in preference to the already-selected child. 1192 DenseSet<ValuePair> CurrentPairs; 1193 1194 bool CanAdd = true; 1195 for (DenseMap<ValuePair, size_t>::iterator C2 1196 = BestChildren.begin(), E2 = BestChildren.end(); 1197 C2 != E2; ++C2) { 1198 if (C2->first.first == C->first.first || 1199 C2->first.first == C->first.second || 1200 C2->first.second == C->first.first || 1201 C2->first.second == C->first.second || 1202 pairsConflict(C2->first, C->first, PairableInstUsers, 1203 UseCycleCheck ? &PairableInstUserMap : 0)) { 1204 if (C2->second >= C->second) { 1205 CanAdd = false; 1206 break; 1207 } 1208 1209 CurrentPairs.insert(C2->first); 1210 } 1211 } 1212 if (!CanAdd) continue; 1213 1214 // Even worse, this child could conflict with another node already 1215 // selected for the Tree. If that is the case, ignore this child. 1216 for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(), 1217 E2 = PrunedTree.end(); T != E2; ++T) { 1218 if (T->first == C->first.first || 1219 T->first == C->first.second || 1220 T->second == C->first.first || 1221 T->second == C->first.second || 1222 pairsConflict(*T, C->first, PairableInstUsers, 1223 UseCycleCheck ? &PairableInstUserMap : 0)) { 1224 CanAdd = false; 1225 break; 1226 } 1227 1228 CurrentPairs.insert(*T); 1229 } 1230 if (!CanAdd) continue; 1231 1232 // And check the queue too... 1233 for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(), 1234 E2 = Q.end(); C2 != E2; ++C2) { 1235 if (C2->first.first == C->first.first || 1236 C2->first.first == C->first.second || 1237 C2->first.second == C->first.first || 1238 C2->first.second == C->first.second || 1239 pairsConflict(C2->first, C->first, PairableInstUsers, 1240 UseCycleCheck ? &PairableInstUserMap : 0)) { 1241 CanAdd = false; 1242 break; 1243 } 1244 1245 CurrentPairs.insert(C2->first); 1246 } 1247 if (!CanAdd) continue; 1248 1249 // Last but not least, check for a conflict with any of the 1250 // already-chosen pairs. 1251 for (DenseMap<Value *, Value *>::iterator C2 = 1252 ChosenPairs.begin(), E2 = ChosenPairs.end(); 1253 C2 != E2; ++C2) { 1254 if (pairsConflict(*C2, C->first, PairableInstUsers, 1255 UseCycleCheck ? &PairableInstUserMap : 0)) { 1256 CanAdd = false; 1257 break; 1258 } 1259 1260 CurrentPairs.insert(*C2); 1261 } 1262 if (!CanAdd) continue; 1263 1264 // To check for non-trivial cycles formed by the addition of the 1265 // current pair we've formed a list of all relevant pairs, now use a 1266 // graph walk to check for a cycle. We start from the current pair and 1267 // walk the use tree to see if we again reach the current pair. If we 1268 // do, then the current pair is rejected. 1269 1270 // FIXME: It may be more efficient to use a topological-ordering 1271 // algorithm to improve the cycle check. This should be investigated. 1272 if (UseCycleCheck && 1273 pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) 1274 continue; 1275 1276 // This child can be added, but we may have chosen it in preference 1277 // to an already-selected child. Check for this here, and if a 1278 // conflict is found, then remove the previously-selected child 1279 // before adding this one in its place. 1280 for (DenseMap<ValuePair, size_t>::iterator C2 1281 = BestChildren.begin(); C2 != BestChildren.end();) { 1282 if (C2->first.first == C->first.first || 1283 C2->first.first == C->first.second || 1284 C2->first.second == C->first.first || 1285 C2->first.second == C->first.second || 1286 pairsConflict(C2->first, C->first, PairableInstUsers)) 1287 BestChildren.erase(C2++); 1288 else 1289 ++C2; 1290 } 1291 1292 BestChildren.insert(ValuePairWithDepth(C->first, C->second)); 1293 } 1294 1295 for (DenseMap<ValuePair, size_t>::iterator C 1296 = BestChildren.begin(), E2 = BestChildren.end(); 1297 C != E2; ++C) { 1298 size_t DepthF = getDepthFactor(C->first.first); 1299 Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); 1300 } 1301 } while (!Q.empty()); 1302 } 1303 1304 // This function finds the best tree of mututally-compatible connected 1305 // pairs, given the choice of root pairs as an iterator range. 1306 void BBVectorize::findBestTreeFor( 1307 std::multimap<Value *, Value *> &CandidatePairs, 1308 std::vector<Value *> &PairableInsts, 1309 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1310 DenseSet<ValuePair> &PairableInstUsers, 1311 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 1312 DenseMap<Value *, Value *> &ChosenPairs, 1313 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, 1314 size_t &BestEffSize, VPIteratorPair ChoiceRange, 1315 bool UseCycleCheck) { 1316 for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first; 1317 J != ChoiceRange.second; ++J) { 1318 1319 // Before going any further, make sure that this pair does not 1320 // conflict with any already-selected pairs (see comment below 1321 // near the Tree pruning for more details). 1322 DenseSet<ValuePair> ChosenPairSet; 1323 bool DoesConflict = false; 1324 for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), 1325 E = ChosenPairs.end(); C != E; ++C) { 1326 if (pairsConflict(*C, *J, PairableInstUsers, 1327 UseCycleCheck ? &PairableInstUserMap : 0)) { 1328 DoesConflict = true; 1329 break; 1330 } 1331 1332 ChosenPairSet.insert(*C); 1333 } 1334 if (DoesConflict) continue; 1335 1336 if (UseCycleCheck && 1337 pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet)) 1338 continue; 1339 1340 DenseMap<ValuePair, size_t> Tree; 1341 buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1342 PairableInstUsers, ChosenPairs, Tree, *J); 1343 1344 // Because we'll keep the child with the largest depth, the largest 1345 // depth is still the same in the unpruned Tree. 1346 size_t MaxDepth = Tree.lookup(*J); 1347 1348 DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {" 1349 << *J->first << " <-> " << *J->second << "} of depth " << 1350 MaxDepth << " and size " << Tree.size() << "\n"); 1351 1352 // At this point the Tree has been constructed, but, may contain 1353 // contradictory children (meaning that different children of 1354 // some tree node may be attempting to fuse the same instruction). 1355 // So now we walk the tree again, in the case of a conflict, 1356 // keep only the child with the largest depth. To break a tie, 1357 // favor the first child. 1358 1359 DenseSet<ValuePair> PrunedTree; 1360 pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1361 PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree, 1362 PrunedTree, *J, UseCycleCheck); 1363 1364 size_t EffSize = 0; 1365 for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), 1366 E = PrunedTree.end(); S != E; ++S) 1367 EffSize += getDepthFactor(S->first); 1368 1369 DEBUG(if (DebugPairSelection) 1370 dbgs() << "BBV: found pruned Tree for pair {" 1371 << *J->first << " <-> " << *J->second << "} of depth " << 1372 MaxDepth << " and size " << PrunedTree.size() << 1373 " (effective size: " << EffSize << ")\n"); 1374 if (MaxDepth >= Config.ReqChainDepth && EffSize > BestEffSize) { 1375 BestMaxDepth = MaxDepth; 1376 BestEffSize = EffSize; 1377 BestTree = PrunedTree; 1378 } 1379 } 1380 } 1381 1382 // Given the list of candidate pairs, this function selects those 1383 // that will be fused into vector instructions. 1384 void BBVectorize::choosePairs( 1385 std::multimap<Value *, Value *> &CandidatePairs, 1386 std::vector<Value *> &PairableInsts, 1387 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1388 DenseSet<ValuePair> &PairableInstUsers, 1389 DenseMap<Value *, Value *>& ChosenPairs) { 1390 bool UseCycleCheck = 1391 CandidatePairs.size() <= Config.MaxCandPairsForCycleCheck; 1392 std::multimap<ValuePair, ValuePair> PairableInstUserMap; 1393 for (std::vector<Value *>::iterator I = PairableInsts.begin(), 1394 E = PairableInsts.end(); I != E; ++I) { 1395 // The number of possible pairings for this variable: 1396 size_t NumChoices = CandidatePairs.count(*I); 1397 if (!NumChoices) continue; 1398 1399 VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I); 1400 1401 // The best pair to choose and its tree: 1402 size_t BestMaxDepth = 0, BestEffSize = 0; 1403 DenseSet<ValuePair> BestTree; 1404 findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1405 PairableInstUsers, PairableInstUserMap, ChosenPairs, 1406 BestTree, BestMaxDepth, BestEffSize, ChoiceRange, 1407 UseCycleCheck); 1408 1409 // A tree has been chosen (or not) at this point. If no tree was 1410 // chosen, then this instruction, I, cannot be paired (and is no longer 1411 // considered). 1412 1413 DEBUG(if (BestTree.size() > 0) 1414 dbgs() << "BBV: selected pairs in the best tree for: " 1415 << *cast<Instruction>(*I) << "\n"); 1416 1417 for (DenseSet<ValuePair>::iterator S = BestTree.begin(), 1418 SE2 = BestTree.end(); S != SE2; ++S) { 1419 // Insert the members of this tree into the list of chosen pairs. 1420 ChosenPairs.insert(ValuePair(S->first, S->second)); 1421 DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << 1422 *S->second << "\n"); 1423 1424 // Remove all candidate pairs that have values in the chosen tree. 1425 for (std::multimap<Value *, Value *>::iterator K = 1426 CandidatePairs.begin(); K != CandidatePairs.end();) { 1427 if (K->first == S->first || K->second == S->first || 1428 K->second == S->second || K->first == S->second) { 1429 // Don't remove the actual pair chosen so that it can be used 1430 // in subsequent tree selections. 1431 if (!(K->first == S->first && K->second == S->second)) 1432 CandidatePairs.erase(K++); 1433 else 1434 ++K; 1435 } else { 1436 ++K; 1437 } 1438 } 1439 } 1440 } 1441 1442 DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); 1443 } 1444 1445 std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, 1446 unsigned n = 0) { 1447 if (!I->hasName()) 1448 return ""; 1449 1450 return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + 1451 (n > 0 ? "." + utostr(n) : "")).str(); 1452 } 1453 1454 // Returns the value that is to be used as the pointer input to the vector 1455 // instruction that fuses I with J. 1456 Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, 1457 Instruction *I, Instruction *J, unsigned o, 1458 bool &FlipMemInputs) { 1459 Value *IPtr, *JPtr; 1460 unsigned IAlignment, JAlignment; 1461 int64_t OffsetInElmts; 1462 (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 1463 OffsetInElmts); 1464 1465 // The pointer value is taken to be the one with the lowest offset. 1466 Value *VPtr; 1467 if (OffsetInElmts > 0) { 1468 VPtr = IPtr; 1469 } else { 1470 FlipMemInputs = true; 1471 VPtr = JPtr; 1472 } 1473 1474 Type *ArgType = cast<PointerType>(IPtr->getType())->getElementType(); 1475 Type *VArgType = getVecTypeForPair(ArgType); 1476 Type *VArgPtrType = PointerType::get(VArgType, 1477 cast<PointerType>(IPtr->getType())->getAddressSpace()); 1478 return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), 1479 /* insert before */ FlipMemInputs ? J : I); 1480 } 1481 1482 void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, 1483 unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, 1484 unsigned IdxOffset, std::vector<Constant*> &Mask) { 1485 for (unsigned v = 0; v < NumElem/2; ++v) { 1486 int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); 1487 if (m < 0) { 1488 Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); 1489 } else { 1490 unsigned mm = m + (int) IdxOffset; 1491 if (m >= (int) NumInElem) 1492 mm += (int) NumInElem; 1493 1494 Mask[v+MaskOffset] = 1495 ConstantInt::get(Type::getInt32Ty(Context), mm); 1496 } 1497 } 1498 } 1499 1500 // Returns the value that is to be used as the vector-shuffle mask to the 1501 // vector instruction that fuses I with J. 1502 Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, 1503 Instruction *I, Instruction *J) { 1504 // This is the shuffle mask. We need to append the second 1505 // mask to the first, and the numbers need to be adjusted. 1506 1507 Type *ArgType = I->getType(); 1508 Type *VArgType = getVecTypeForPair(ArgType); 1509 1510 // Get the total number of elements in the fused vector type. 1511 // By definition, this must equal the number of elements in 1512 // the final mask. 1513 unsigned NumElem = cast<VectorType>(VArgType)->getNumElements(); 1514 std::vector<Constant*> Mask(NumElem); 1515 1516 Type *OpType = I->getOperand(0)->getType(); 1517 unsigned NumInElem = cast<VectorType>(OpType)->getNumElements(); 1518 1519 // For the mask from the first pair... 1520 fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask); 1521 1522 // For the mask from the second pair... 1523 fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem, 1524 Mask); 1525 1526 return ConstantVector::get(Mask); 1527 } 1528 1529 // Returns the value to be used as the specified operand of the vector 1530 // instruction that fuses I with J. 1531 Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, 1532 Instruction *J, unsigned o, bool FlipMemInputs) { 1533 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 1534 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); 1535 1536 // Compute the fused vector type for this operand 1537 Type *ArgType = I->getOperand(o)->getType(); 1538 VectorType *VArgType = getVecTypeForPair(ArgType); 1539 1540 Instruction *L = I, *H = J; 1541 if (FlipMemInputs) { 1542 L = J; 1543 H = I; 1544 } 1545 1546 if (ArgType->isVectorTy()) { 1547 unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); 1548 std::vector<Constant*> Mask(numElem); 1549 for (unsigned v = 0; v < numElem; ++v) 1550 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 1551 1552 Instruction *BV = new ShuffleVectorInst(L->getOperand(o), 1553 H->getOperand(o), 1554 ConstantVector::get(Mask), 1555 getReplacementName(I, true, o)); 1556 BV->insertBefore(J); 1557 return BV; 1558 } 1559 1560 // If these two inputs are the output of another vector instruction, 1561 // then we should use that output directly. It might be necessary to 1562 // permute it first. [When pairings are fused recursively, you can 1563 // end up with cases where a large vector is decomposed into scalars 1564 // using extractelement instructions, then built into size-2 1565 // vectors using insertelement and the into larger vectors using 1566 // shuffles. InstCombine does not simplify all of these cases well, 1567 // and so we make sure that shuffles are generated here when possible. 1568 ExtractElementInst *LEE 1569 = dyn_cast<ExtractElementInst>(L->getOperand(o)); 1570 ExtractElementInst *HEE 1571 = dyn_cast<ExtractElementInst>(H->getOperand(o)); 1572 1573 if (LEE && HEE && 1574 LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) { 1575 VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType()); 1576 unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue(); 1577 unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue(); 1578 if (LEE->getOperand(0) == HEE->getOperand(0)) { 1579 if (LowIndx == 0 && HighIndx == 1) 1580 return LEE->getOperand(0); 1581 1582 std::vector<Constant*> Mask(2); 1583 Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); 1584 Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); 1585 1586 Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), 1587 UndefValue::get(EEType), 1588 ConstantVector::get(Mask), 1589 getReplacementName(I, true, o)); 1590 BV->insertBefore(J); 1591 return BV; 1592 } 1593 1594 std::vector<Constant*> Mask(2); 1595 HighIndx += EEType->getNumElements(); 1596 Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); 1597 Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); 1598 1599 Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), 1600 HEE->getOperand(0), 1601 ConstantVector::get(Mask), 1602 getReplacementName(I, true, o)); 1603 BV->insertBefore(J); 1604 return BV; 1605 } 1606 1607 Instruction *BV1 = InsertElementInst::Create( 1608 UndefValue::get(VArgType), 1609 L->getOperand(o), CV0, 1610 getReplacementName(I, true, o, 1)); 1611 BV1->insertBefore(I); 1612 Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o), 1613 CV1, 1614 getReplacementName(I, true, o, 2)); 1615 BV2->insertBefore(J); 1616 return BV2; 1617 } 1618 1619 // This function creates an array of values that will be used as the inputs 1620 // to the vector instruction that fuses I with J. 1621 void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, 1622 Instruction *I, Instruction *J, 1623 SmallVector<Value *, 3> &ReplacedOperands, 1624 bool &FlipMemInputs) { 1625 FlipMemInputs = false; 1626 unsigned NumOperands = I->getNumOperands(); 1627 1628 for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { 1629 // Iterate backward so that we look at the store pointer 1630 // first and know whether or not we need to flip the inputs. 1631 1632 if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { 1633 // This is the pointer for a load/store instruction. 1634 ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o, 1635 FlipMemInputs); 1636 continue; 1637 } else if (isa<CallInst>(I)) { 1638 Function *F = cast<CallInst>(I)->getCalledFunction(); 1639 unsigned IID = F->getIntrinsicID(); 1640 if (o == NumOperands-1) { 1641 BasicBlock &BB = *I->getParent(); 1642 1643 Module *M = BB.getParent()->getParent(); 1644 Type *ArgType = I->getType(); 1645 Type *VArgType = getVecTypeForPair(ArgType); 1646 1647 // FIXME: is it safe to do this here? 1648 ReplacedOperands[o] = Intrinsic::getDeclaration(M, 1649 (Intrinsic::ID) IID, VArgType); 1650 continue; 1651 } else if (IID == Intrinsic::powi && o == 1) { 1652 // The second argument of powi is a single integer and we've already 1653 // checked that both arguments are equal. As a result, we just keep 1654 // I's second argument. 1655 ReplacedOperands[o] = I->getOperand(o); 1656 continue; 1657 } 1658 } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) { 1659 ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); 1660 continue; 1661 } 1662 1663 ReplacedOperands[o] = 1664 getReplacementInput(Context, I, J, o, FlipMemInputs); 1665 } 1666 } 1667 1668 // This function creates two values that represent the outputs of the 1669 // original I and J instructions. These are generally vector shuffles 1670 // or extracts. In many cases, these will end up being unused and, thus, 1671 // eliminated by later passes. 1672 void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 1673 Instruction *J, Instruction *K, 1674 Instruction *&InsertionPt, 1675 Instruction *&K1, Instruction *&K2, 1676 bool &FlipMemInputs) { 1677 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 1678 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); 1679 1680 if (isa<StoreInst>(I)) { 1681 AA->replaceWithNewValue(I, K); 1682 AA->replaceWithNewValue(J, K); 1683 } else { 1684 Type *IType = I->getType(); 1685 Type *VType = getVecTypeForPair(IType); 1686 1687 if (IType->isVectorTy()) { 1688 unsigned numElem = cast<VectorType>(IType)->getNumElements(); 1689 std::vector<Constant*> Mask1(numElem), Mask2(numElem); 1690 for (unsigned v = 0; v < numElem; ++v) { 1691 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 1692 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v); 1693 } 1694 1695 K1 = new ShuffleVectorInst(K, UndefValue::get(VType), 1696 ConstantVector::get( 1697 FlipMemInputs ? Mask2 : Mask1), 1698 getReplacementName(K, false, 1)); 1699 K2 = new ShuffleVectorInst(K, UndefValue::get(VType), 1700 ConstantVector::get( 1701 FlipMemInputs ? Mask1 : Mask2), 1702 getReplacementName(K, false, 2)); 1703 } else { 1704 K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0, 1705 getReplacementName(K, false, 1)); 1706 K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1, 1707 getReplacementName(K, false, 2)); 1708 } 1709 1710 K1->insertAfter(K); 1711 K2->insertAfter(K1); 1712 InsertionPt = K2; 1713 } 1714 } 1715 1716 // Move all uses of the function I (including pairing-induced uses) after J. 1717 bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, 1718 std::multimap<Value *, Value *> &LoadMoveSet, 1719 Instruction *I, Instruction *J) { 1720 // Skip to the first instruction past I. 1721 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 1722 1723 DenseSet<Value *> Users; 1724 AliasSetTracker WriteSet(*AA); 1725 for (; cast<Instruction>(L) != J; ++L) 1726 (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet); 1727 1728 assert(cast<Instruction>(L) == J && 1729 "Tracking has not proceeded far enough to check for dependencies"); 1730 // If J is now in the use set of I, then trackUsesOfI will return true 1731 // and we have a dependency cycle (and the fusing operation must abort). 1732 return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSet); 1733 } 1734 1735 // Move all uses of the function I (including pairing-induced uses) after J. 1736 void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, 1737 std::multimap<Value *, Value *> &LoadMoveSet, 1738 Instruction *&InsertionPt, 1739 Instruction *I, Instruction *J) { 1740 // Skip to the first instruction past I. 1741 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 1742 1743 DenseSet<Value *> Users; 1744 AliasSetTracker WriteSet(*AA); 1745 for (; cast<Instruction>(L) != J;) { 1746 if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) { 1747 // Move this instruction 1748 Instruction *InstToMove = L; ++L; 1749 1750 DEBUG(dbgs() << "BBV: moving: " << *InstToMove << 1751 " to after " << *InsertionPt << "\n"); 1752 InstToMove->removeFromParent(); 1753 InstToMove->insertAfter(InsertionPt); 1754 InsertionPt = InstToMove; 1755 } else { 1756 ++L; 1757 } 1758 } 1759 } 1760 1761 // Collect all load instruction that are in the move set of a given first 1762 // pair member. These loads depend on the first instruction, I, and so need 1763 // to be moved after J (the second instruction) when the pair is fused. 1764 void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, 1765 DenseMap<Value *, Value *> &ChosenPairs, 1766 std::multimap<Value *, Value *> &LoadMoveSet, 1767 Instruction *I) { 1768 // Skip to the first instruction past I. 1769 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 1770 1771 DenseSet<Value *> Users; 1772 AliasSetTracker WriteSet(*AA); 1773 1774 // Note: We cannot end the loop when we reach J because J could be moved 1775 // farther down the use chain by another instruction pairing. Also, J 1776 // could be before I if this is an inverted input. 1777 for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) { 1778 if (trackUsesOfI(Users, WriteSet, I, L)) { 1779 if (L->mayReadFromMemory()) 1780 LoadMoveSet.insert(ValuePair(L, I)); 1781 } 1782 } 1783 } 1784 1785 // In cases where both load/stores and the computation of their pointers 1786 // are chosen for vectorization, we can end up in a situation where the 1787 // aliasing analysis starts returning different query results as the 1788 // process of fusing instruction pairs continues. Because the algorithm 1789 // relies on finding the same use trees here as were found earlier, we'll 1790 // need to precompute the necessary aliasing information here and then 1791 // manually update it during the fusion process. 1792 void BBVectorize::collectLoadMoveSet(BasicBlock &BB, 1793 std::vector<Value *> &PairableInsts, 1794 DenseMap<Value *, Value *> &ChosenPairs, 1795 std::multimap<Value *, Value *> &LoadMoveSet) { 1796 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 1797 PIE = PairableInsts.end(); PI != PIE; ++PI) { 1798 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); 1799 if (P == ChosenPairs.end()) continue; 1800 1801 Instruction *I = cast<Instruction>(P->first); 1802 collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I); 1803 } 1804 } 1805 1806 // When the first instruction in each pair is cloned, it will inherit its 1807 // parent's metadata. This metadata must be combined with that of the other 1808 // instruction in a safe way. 1809 void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) { 1810 SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata; 1811 K->getAllMetadataOtherThanDebugLoc(Metadata); 1812 for (unsigned i = 0, n = Metadata.size(); i < n; ++i) { 1813 unsigned Kind = Metadata[i].first; 1814 MDNode *JMD = J->getMetadata(Kind); 1815 MDNode *KMD = Metadata[i].second; 1816 1817 switch (Kind) { 1818 default: 1819 K->setMetadata(Kind, 0); // Remove unknown metadata 1820 break; 1821 case LLVMContext::MD_tbaa: 1822 K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD)); 1823 break; 1824 case LLVMContext::MD_fpmath: 1825 K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD)); 1826 break; 1827 } 1828 } 1829 } 1830 1831 // This function fuses the chosen instruction pairs into vector instructions, 1832 // taking care preserve any needed scalar outputs and, then, it reorders the 1833 // remaining instructions as needed (users of the first member of the pair 1834 // need to be moved to after the location of the second member of the pair 1835 // because the vector instruction is inserted in the location of the pair's 1836 // second member). 1837 void BBVectorize::fuseChosenPairs(BasicBlock &BB, 1838 std::vector<Value *> &PairableInsts, 1839 DenseMap<Value *, Value *> &ChosenPairs) { 1840 LLVMContext& Context = BB.getContext(); 1841 1842 // During the vectorization process, the order of the pairs to be fused 1843 // could be flipped. So we'll add each pair, flipped, into the ChosenPairs 1844 // list. After a pair is fused, the flipped pair is removed from the list. 1845 std::vector<ValuePair> FlippedPairs; 1846 FlippedPairs.reserve(ChosenPairs.size()); 1847 for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), 1848 E = ChosenPairs.end(); P != E; ++P) 1849 FlippedPairs.push_back(ValuePair(P->second, P->first)); 1850 for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(), 1851 E = FlippedPairs.end(); P != E; ++P) 1852 ChosenPairs.insert(*P); 1853 1854 std::multimap<Value *, Value *> LoadMoveSet; 1855 collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet); 1856 1857 DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); 1858 1859 for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { 1860 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI); 1861 if (P == ChosenPairs.end()) { 1862 ++PI; 1863 continue; 1864 } 1865 1866 if (getDepthFactor(P->first) == 0) { 1867 // These instructions are not really fused, but are tracked as though 1868 // they are. Any case in which it would be interesting to fuse them 1869 // will be taken care of by InstCombine. 1870 --NumFusedOps; 1871 ++PI; 1872 continue; 1873 } 1874 1875 Instruction *I = cast<Instruction>(P->first), 1876 *J = cast<Instruction>(P->second); 1877 1878 DEBUG(dbgs() << "BBV: fusing: " << *I << 1879 " <-> " << *J << "\n"); 1880 1881 // Remove the pair and flipped pair from the list. 1882 DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second); 1883 assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); 1884 ChosenPairs.erase(FP); 1885 ChosenPairs.erase(P); 1886 1887 if (!canMoveUsesOfIAfterJ(BB, LoadMoveSet, I, J)) { 1888 DEBUG(dbgs() << "BBV: fusion of: " << *I << 1889 " <-> " << *J << 1890 " aborted because of non-trivial dependency cycle\n"); 1891 --NumFusedOps; 1892 ++PI; 1893 continue; 1894 } 1895 1896 bool FlipMemInputs; 1897 unsigned NumOperands = I->getNumOperands(); 1898 SmallVector<Value *, 3> ReplacedOperands(NumOperands); 1899 getReplacementInputsForPair(Context, I, J, ReplacedOperands, 1900 FlipMemInputs); 1901 1902 // Make a copy of the original operation, change its type to the vector 1903 // type and replace its operands with the vector operands. 1904 Instruction *K = I->clone(); 1905 if (I->hasName()) K->takeName(I); 1906 1907 if (!isa<StoreInst>(K)) 1908 K->mutateType(getVecTypeForPair(I->getType())); 1909 1910 combineMetadata(K, J); 1911 1912 for (unsigned o = 0; o < NumOperands; ++o) 1913 K->setOperand(o, ReplacedOperands[o]); 1914 1915 // If we've flipped the memory inputs, make sure that we take the correct 1916 // alignment. 1917 if (FlipMemInputs) { 1918 if (isa<StoreInst>(K)) 1919 cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment()); 1920 else 1921 cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment()); 1922 } 1923 1924 K->insertAfter(J); 1925 1926 // Instruction insertion point: 1927 Instruction *InsertionPt = K; 1928 Instruction *K1 = 0, *K2 = 0; 1929 replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2, 1930 FlipMemInputs); 1931 1932 // The use tree of the first original instruction must be moved to after 1933 // the location of the second instruction. The entire use tree of the 1934 // first instruction is disjoint from the input tree of the second 1935 // (by definition), and so commutes with it. 1936 1937 moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J); 1938 1939 if (!isa<StoreInst>(I)) { 1940 I->replaceAllUsesWith(K1); 1941 J->replaceAllUsesWith(K2); 1942 AA->replaceWithNewValue(I, K1); 1943 AA->replaceWithNewValue(J, K2); 1944 } 1945 1946 // Instructions that may read from memory may be in the load move set. 1947 // Once an instruction is fused, we no longer need its move set, and so 1948 // the values of the map never need to be updated. However, when a load 1949 // is fused, we need to merge the entries from both instructions in the 1950 // pair in case those instructions were in the move set of some other 1951 // yet-to-be-fused pair. The loads in question are the keys of the map. 1952 if (I->mayReadFromMemory()) { 1953 std::vector<ValuePair> NewSetMembers; 1954 VPIteratorPair IPairRange = LoadMoveSet.equal_range(I); 1955 VPIteratorPair JPairRange = LoadMoveSet.equal_range(J); 1956 for (std::multimap<Value *, Value *>::iterator N = IPairRange.first; 1957 N != IPairRange.second; ++N) 1958 NewSetMembers.push_back(ValuePair(K, N->second)); 1959 for (std::multimap<Value *, Value *>::iterator N = JPairRange.first; 1960 N != JPairRange.second; ++N) 1961 NewSetMembers.push_back(ValuePair(K, N->second)); 1962 for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), 1963 AE = NewSetMembers.end(); A != AE; ++A) 1964 LoadMoveSet.insert(*A); 1965 } 1966 1967 // Before removing I, set the iterator to the next instruction. 1968 PI = llvm::next(BasicBlock::iterator(I)); 1969 if (cast<Instruction>(PI) == J) 1970 ++PI; 1971 1972 SE->forgetValue(I); 1973 SE->forgetValue(J); 1974 I->eraseFromParent(); 1975 J->eraseFromParent(); 1976 } 1977 1978 DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); 1979 } 1980} 1981 1982char BBVectorize::ID = 0; 1983static const char bb_vectorize_name[] = "Basic-Block Vectorization"; 1984INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 1985INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 1986INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 1987INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 1988 1989BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { 1990 return new BBVectorize(C); 1991} 1992 1993bool 1994llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { 1995 BBVectorize BBVectorizer(P, C); 1996 return BBVectorizer.vectorizeBB(BB); 1997} 1998 1999//===----------------------------------------------------------------------===// 2000VectorizeConfig::VectorizeConfig() { 2001 VectorBits = ::VectorBits; 2002 VectorizeBools = !::NoBools; 2003 VectorizeInts = !::NoInts; 2004 VectorizeFloats = !::NoFloats; 2005 VectorizePointers = !::NoPointers; 2006 VectorizeCasts = !::NoCasts; 2007 VectorizeMath = !::NoMath; 2008 VectorizeFMA = !::NoFMA; 2009 VectorizeSelect = !::NoSelect; 2010 VectorizeCmp = !::NoCmp; 2011 VectorizeGEP = !::NoGEP; 2012 VectorizeMemOps = !::NoMemOps; 2013 AlignedOnly = ::AlignedOnly; 2014 ReqChainDepth= ::ReqChainDepth; 2015 SearchLimit = ::SearchLimit; 2016 MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; 2017 SplatBreaksChain = ::SplatBreaksChain; 2018 MaxInsts = ::MaxInsts; 2019 MaxIter = ::MaxIter; 2020 NoMemOpBoost = ::NoMemOpBoost; 2021 FastDep = ::FastDep; 2022} 2023