BBVectorize.cpp revision ec5c3c2bd37828e9fbd913f5ac7b7f75711ddd59
1//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements a basic-block vectorization pass. The algorithm was 11// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, 12// et al. It works by looking for chains of pairable operations and then 13// pairing them. 14// 15//===----------------------------------------------------------------------===// 16 17#define BBV_NAME "bb-vectorize" 18#define DEBUG_TYPE BBV_NAME 19#include "llvm/Transforms/Vectorize.h" 20#include "llvm/ADT/DenseMap.h" 21#include "llvm/ADT/DenseSet.h" 22#include "llvm/ADT/STLExtras.h" 23#include "llvm/ADT/SmallSet.h" 24#include "llvm/ADT/SmallVector.h" 25#include "llvm/ADT/Statistic.h" 26#include "llvm/ADT/StringExtras.h" 27#include "llvm/Analysis/AliasAnalysis.h" 28#include "llvm/Analysis/AliasSetTracker.h" 29#include "llvm/Analysis/Dominators.h" 30#include "llvm/Analysis/ScalarEvolution.h" 31#include "llvm/Analysis/ScalarEvolutionExpressions.h" 32#include "llvm/Analysis/TargetTransformInfo.h" 33#include "llvm/Analysis/ValueTracking.h" 34#include "llvm/IR/Constants.h" 35#include "llvm/IR/DataLayout.h" 36#include "llvm/IR/DerivedTypes.h" 37#include "llvm/IR/Function.h" 38#include "llvm/IR/Instructions.h" 39#include "llvm/IR/IntrinsicInst.h" 40#include "llvm/IR/Intrinsics.h" 41#include "llvm/IR/LLVMContext.h" 42#include "llvm/IR/Metadata.h" 43#include "llvm/IR/Type.h" 44#include "llvm/Pass.h" 45#include "llvm/Support/CommandLine.h" 46#include "llvm/Support/Debug.h" 47#include "llvm/Support/ValueHandle.h" 48#include "llvm/Support/raw_ostream.h" 49#include "llvm/Transforms/Utils/Local.h" 50#include <algorithm> 51#include <map> 52using namespace llvm; 53 54static cl::opt<bool> 55IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), 56 cl::Hidden, cl::desc("Ignore target information")); 57 58static cl::opt<unsigned> 59ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, 60 cl::desc("The required chain depth for vectorization")); 61 62static cl::opt<bool> 63UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), 64 cl::Hidden, cl::desc("Use the chain depth requirement with" 65 " target information")); 66 67static cl::opt<unsigned> 68SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, 69 cl::desc("The maximum search distance for instruction pairs")); 70 71static cl::opt<bool> 72SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, 73 cl::desc("Replicating one element to a pair breaks the chain")); 74 75static cl::opt<unsigned> 76VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, 77 cl::desc("The size of the native vector registers")); 78 79static cl::opt<unsigned> 80MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, 81 cl::desc("The maximum number of pairing iterations")); 82 83static cl::opt<bool> 84Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, 85 cl::desc("Don't try to form non-2^n-length vectors")); 86 87static cl::opt<unsigned> 88MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, 89 cl::desc("The maximum number of pairable instructions per group")); 90 91static cl::opt<unsigned> 92MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), 93 cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" 94 " a full cycle check")); 95 96static cl::opt<bool> 97NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, 98 cl::desc("Don't try to vectorize boolean (i1) values")); 99 100static cl::opt<bool> 101NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, 102 cl::desc("Don't try to vectorize integer values")); 103 104static cl::opt<bool> 105NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, 106 cl::desc("Don't try to vectorize floating-point values")); 107 108// FIXME: This should default to false once pointer vector support works. 109static cl::opt<bool> 110NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden, 111 cl::desc("Don't try to vectorize pointer values")); 112 113static cl::opt<bool> 114NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, 115 cl::desc("Don't try to vectorize casting (conversion) operations")); 116 117static cl::opt<bool> 118NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, 119 cl::desc("Don't try to vectorize floating-point math intrinsics")); 120 121static cl::opt<bool> 122NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, 123 cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); 124 125static cl::opt<bool> 126NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, 127 cl::desc("Don't try to vectorize select instructions")); 128 129static cl::opt<bool> 130NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, 131 cl::desc("Don't try to vectorize comparison instructions")); 132 133static cl::opt<bool> 134NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, 135 cl::desc("Don't try to vectorize getelementptr instructions")); 136 137static cl::opt<bool> 138NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, 139 cl::desc("Don't try to vectorize loads and stores")); 140 141static cl::opt<bool> 142AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, 143 cl::desc("Only generate aligned loads and stores")); 144 145static cl::opt<bool> 146NoMemOpBoost("bb-vectorize-no-mem-op-boost", 147 cl::init(false), cl::Hidden, 148 cl::desc("Don't boost the chain-depth contribution of loads and stores")); 149 150static cl::opt<bool> 151FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, 152 cl::desc("Use a fast instruction dependency analysis")); 153 154#ifndef NDEBUG 155static cl::opt<bool> 156DebugInstructionExamination("bb-vectorize-debug-instruction-examination", 157 cl::init(false), cl::Hidden, 158 cl::desc("When debugging is enabled, output information on the" 159 " instruction-examination process")); 160static cl::opt<bool> 161DebugCandidateSelection("bb-vectorize-debug-candidate-selection", 162 cl::init(false), cl::Hidden, 163 cl::desc("When debugging is enabled, output information on the" 164 " candidate-selection process")); 165static cl::opt<bool> 166DebugPairSelection("bb-vectorize-debug-pair-selection", 167 cl::init(false), cl::Hidden, 168 cl::desc("When debugging is enabled, output information on the" 169 " pair-selection process")); 170static cl::opt<bool> 171DebugCycleCheck("bb-vectorize-debug-cycle-check", 172 cl::init(false), cl::Hidden, 173 cl::desc("When debugging is enabled, output information on the" 174 " cycle-checking process")); 175 176static cl::opt<bool> 177PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", 178 cl::init(false), cl::Hidden, 179 cl::desc("When debugging is enabled, dump the basic block after" 180 " every pair is fused")); 181#endif 182 183STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); 184 185namespace { 186 struct BBVectorize : public BasicBlockPass { 187 static char ID; // Pass identification, replacement for typeid 188 189 const VectorizeConfig Config; 190 191 BBVectorize(const VectorizeConfig &C = VectorizeConfig()) 192 : BasicBlockPass(ID), Config(C) { 193 initializeBBVectorizePass(*PassRegistry::getPassRegistry()); 194 } 195 196 BBVectorize(Pass *P, const VectorizeConfig &C) 197 : BasicBlockPass(ID), Config(C) { 198 AA = &P->getAnalysis<AliasAnalysis>(); 199 DT = &P->getAnalysis<DominatorTree>(); 200 SE = &P->getAnalysis<ScalarEvolution>(); 201 TD = P->getAnalysisIfAvailable<DataLayout>(); 202 TTI = IgnoreTargetInfo ? 0 : &P->getAnalysis<TargetTransformInfo>(); 203 } 204 205 typedef std::pair<Value *, Value *> ValuePair; 206 typedef std::pair<ValuePair, int> ValuePairWithCost; 207 typedef std::pair<ValuePair, size_t> ValuePairWithDepth; 208 typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair 209 typedef std::pair<VPPair, unsigned> VPPairWithType; 210 typedef std::pair<std::multimap<Value *, Value *>::iterator, 211 std::multimap<Value *, Value *>::iterator> VPIteratorPair; 212 typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator, 213 std::multimap<ValuePair, ValuePair>::iterator> 214 VPPIteratorPair; 215 216 AliasAnalysis *AA; 217 DominatorTree *DT; 218 ScalarEvolution *SE; 219 DataLayout *TD; 220 const TargetTransformInfo *TTI; 221 222 // FIXME: const correct? 223 224 bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false); 225 226 bool getCandidatePairs(BasicBlock &BB, 227 BasicBlock::iterator &Start, 228 std::multimap<Value *, Value *> &CandidatePairs, 229 DenseSet<ValuePair> &FixedOrderPairs, 230 DenseMap<ValuePair, int> &CandidatePairCostSavings, 231 std::vector<Value *> &PairableInsts, bool NonPow2Len); 232 233 // FIXME: The current implementation does not account for pairs that 234 // are connected in multiple ways. For example: 235 // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) 236 enum PairConnectionType { 237 PairConnectionDirect, 238 PairConnectionSwap, 239 PairConnectionSplat 240 }; 241 242 void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, 243 std::vector<Value *> &PairableInsts, 244 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 245 DenseMap<VPPair, unsigned> &PairConnectionTypes); 246 247 void buildDepMap(BasicBlock &BB, 248 std::multimap<Value *, Value *> &CandidatePairs, 249 std::vector<Value *> &PairableInsts, 250 DenseSet<ValuePair> &PairableInstUsers); 251 252 void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, 253 DenseMap<ValuePair, int> &CandidatePairCostSavings, 254 std::vector<Value *> &PairableInsts, 255 DenseSet<ValuePair> &FixedOrderPairs, 256 DenseMap<VPPair, unsigned> &PairConnectionTypes, 257 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 258 std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, 259 DenseSet<ValuePair> &PairableInstUsers, 260 DenseMap<Value *, Value *>& ChosenPairs); 261 262 void fuseChosenPairs(BasicBlock &BB, 263 std::vector<Value *> &PairableInsts, 264 DenseMap<Value *, Value *>& ChosenPairs, 265 DenseSet<ValuePair> &FixedOrderPairs, 266 DenseMap<VPPair, unsigned> &PairConnectionTypes, 267 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 268 std::multimap<ValuePair, ValuePair> &ConnectedPairDeps); 269 270 271 bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); 272 273 bool areInstsCompatible(Instruction *I, Instruction *J, 274 bool IsSimpleLoadStore, bool NonPow2Len, 275 int &CostSavings, int &FixedOrder); 276 277 bool trackUsesOfI(DenseSet<Value *> &Users, 278 AliasSetTracker &WriteSet, Instruction *I, 279 Instruction *J, bool UpdateUsers = true, 280 std::multimap<Value *, Value *> *LoadMoveSet = 0); 281 282 void computePairsConnectedTo( 283 std::multimap<Value *, Value *> &CandidatePairs, 284 std::vector<Value *> &PairableInsts, 285 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 286 DenseMap<VPPair, unsigned> &PairConnectionTypes, 287 ValuePair P); 288 289 bool pairsConflict(ValuePair P, ValuePair Q, 290 DenseSet<ValuePair> &PairableInstUsers, 291 std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0); 292 293 bool pairWillFormCycle(ValuePair P, 294 std::multimap<ValuePair, ValuePair> &PairableInstUsers, 295 DenseSet<ValuePair> &CurrentPairs); 296 297 void pruneTreeFor( 298 std::multimap<Value *, Value *> &CandidatePairs, 299 std::vector<Value *> &PairableInsts, 300 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 301 DenseSet<ValuePair> &PairableInstUsers, 302 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 303 DenseMap<Value *, Value *> &ChosenPairs, 304 DenseMap<ValuePair, size_t> &Tree, 305 DenseSet<ValuePair> &PrunedTree, ValuePair J, 306 bool UseCycleCheck); 307 308 void buildInitialTreeFor( 309 std::multimap<Value *, Value *> &CandidatePairs, 310 std::vector<Value *> &PairableInsts, 311 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 312 DenseSet<ValuePair> &PairableInstUsers, 313 DenseMap<Value *, Value *> &ChosenPairs, 314 DenseMap<ValuePair, size_t> &Tree, ValuePair J); 315 316 void findBestTreeFor( 317 std::multimap<Value *, Value *> &CandidatePairs, 318 DenseMap<ValuePair, int> &CandidatePairCostSavings, 319 std::vector<Value *> &PairableInsts, 320 DenseSet<ValuePair> &FixedOrderPairs, 321 DenseMap<VPPair, unsigned> &PairConnectionTypes, 322 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 323 std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, 324 DenseSet<ValuePair> &PairableInstUsers, 325 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 326 DenseMap<Value *, Value *> &ChosenPairs, 327 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, 328 int &BestEffSize, VPIteratorPair ChoiceRange, 329 bool UseCycleCheck); 330 331 Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, 332 Instruction *J, unsigned o); 333 334 void fillNewShuffleMask(LLVMContext& Context, Instruction *J, 335 unsigned MaskOffset, unsigned NumInElem, 336 unsigned NumInElem1, unsigned IdxOffset, 337 std::vector<Constant*> &Mask); 338 339 Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, 340 Instruction *J); 341 342 bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, 343 unsigned o, Value *&LOp, unsigned numElemL, 344 Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ, 345 unsigned IdxOff = 0); 346 347 Value *getReplacementInput(LLVMContext& Context, Instruction *I, 348 Instruction *J, unsigned o, bool IBeforeJ); 349 350 void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, 351 Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, 352 bool IBeforeJ); 353 354 void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 355 Instruction *J, Instruction *K, 356 Instruction *&InsertionPt, Instruction *&K1, 357 Instruction *&K2); 358 359 void collectPairLoadMoveSet(BasicBlock &BB, 360 DenseMap<Value *, Value *> &ChosenPairs, 361 std::multimap<Value *, Value *> &LoadMoveSet, 362 Instruction *I); 363 364 void collectLoadMoveSet(BasicBlock &BB, 365 std::vector<Value *> &PairableInsts, 366 DenseMap<Value *, Value *> &ChosenPairs, 367 std::multimap<Value *, Value *> &LoadMoveSet); 368 369 bool canMoveUsesOfIAfterJ(BasicBlock &BB, 370 std::multimap<Value *, Value *> &LoadMoveSet, 371 Instruction *I, Instruction *J); 372 373 void moveUsesOfIAfterJ(BasicBlock &BB, 374 std::multimap<Value *, Value *> &LoadMoveSet, 375 Instruction *&InsertionPt, 376 Instruction *I, Instruction *J); 377 378 void combineMetadata(Instruction *K, const Instruction *J); 379 380 bool vectorizeBB(BasicBlock &BB) { 381 if (!DT->isReachableFromEntry(&BB)) { 382 DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() << 383 " in " << BB.getParent()->getName() << "\n"); 384 return false; 385 } 386 387 DEBUG(if (TTI) dbgs() << "BBV: using target information\n"); 388 389 bool changed = false; 390 // Iterate a sufficient number of times to merge types of size 1 bit, 391 // then 2 bits, then 4, etc. up to half of the target vector width of the 392 // target vector register. 393 unsigned n = 1; 394 for (unsigned v = 2; 395 (TTI || v <= Config.VectorBits) && 396 (!Config.MaxIter || n <= Config.MaxIter); 397 v *= 2, ++n) { 398 DEBUG(dbgs() << "BBV: fusing loop #" << n << 399 " for " << BB.getName() << " in " << 400 BB.getParent()->getName() << "...\n"); 401 if (vectorizePairs(BB)) 402 changed = true; 403 else 404 break; 405 } 406 407 if (changed && !Pow2LenOnly) { 408 ++n; 409 for (; !Config.MaxIter || n <= Config.MaxIter; ++n) { 410 DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << 411 n << " for " << BB.getName() << " in " << 412 BB.getParent()->getName() << "...\n"); 413 if (!vectorizePairs(BB, true)) break; 414 } 415 } 416 417 DEBUG(dbgs() << "BBV: done!\n"); 418 return changed; 419 } 420 421 virtual bool runOnBasicBlock(BasicBlock &BB) { 422 AA = &getAnalysis<AliasAnalysis>(); 423 DT = &getAnalysis<DominatorTree>(); 424 SE = &getAnalysis<ScalarEvolution>(); 425 TD = getAnalysisIfAvailable<DataLayout>(); 426 TTI = IgnoreTargetInfo ? 0 : &getAnalysis<TargetTransformInfo>(); 427 428 return vectorizeBB(BB); 429 } 430 431 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 432 BasicBlockPass::getAnalysisUsage(AU); 433 AU.addRequired<AliasAnalysis>(); 434 AU.addRequired<DominatorTree>(); 435 AU.addRequired<ScalarEvolution>(); 436 AU.addRequired<TargetTransformInfo>(); 437 AU.addPreserved<AliasAnalysis>(); 438 AU.addPreserved<DominatorTree>(); 439 AU.addPreserved<ScalarEvolution>(); 440 AU.setPreservesCFG(); 441 } 442 443 static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) { 444 assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() && 445 "Cannot form vector from incompatible scalar types"); 446 Type *STy = ElemTy->getScalarType(); 447 448 unsigned numElem; 449 if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { 450 numElem = VTy->getNumElements(); 451 } else { 452 numElem = 1; 453 } 454 455 if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) { 456 numElem += VTy->getNumElements(); 457 } else { 458 numElem += 1; 459 } 460 461 return VectorType::get(STy, numElem); 462 } 463 464 static inline void getInstructionTypes(Instruction *I, 465 Type *&T1, Type *&T2) { 466 if (isa<StoreInst>(I)) { 467 // For stores, it is the value type, not the pointer type that matters 468 // because the value is what will come from a vector register. 469 470 Value *IVal = cast<StoreInst>(I)->getValueOperand(); 471 T1 = IVal->getType(); 472 } else { 473 T1 = I->getType(); 474 } 475 476 if (I->isCast()) 477 T2 = cast<CastInst>(I)->getSrcTy(); 478 else 479 T2 = T1; 480 481 if (SelectInst *SI = dyn_cast<SelectInst>(I)) { 482 T2 = SI->getCondition()->getType(); 483 } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) { 484 T2 = SI->getOperand(0)->getType(); 485 } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) { 486 T2 = CI->getOperand(0)->getType(); 487 } 488 } 489 490 // Returns the weight associated with the provided value. A chain of 491 // candidate pairs has a length given by the sum of the weights of its 492 // members (one weight per pair; the weight of each member of the pair 493 // is assumed to be the same). This length is then compared to the 494 // chain-length threshold to determine if a given chain is significant 495 // enough to be vectorized. The length is also used in comparing 496 // candidate chains where longer chains are considered to be better. 497 // Note: when this function returns 0, the resulting instructions are 498 // not actually fused. 499 inline size_t getDepthFactor(Value *V) { 500 // InsertElement and ExtractElement have a depth factor of zero. This is 501 // for two reasons: First, they cannot be usefully fused. Second, because 502 // the pass generates a lot of these, they can confuse the simple metric 503 // used to compare the trees in the next iteration. Thus, giving them a 504 // weight of zero allows the pass to essentially ignore them in 505 // subsequent iterations when looking for vectorization opportunities 506 // while still tracking dependency chains that flow through those 507 // instructions. 508 if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V)) 509 return 0; 510 511 // Give a load or store half of the required depth so that load/store 512 // pairs will vectorize. 513 if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V))) 514 return Config.ReqChainDepth/2; 515 516 return 1; 517 } 518 519 // Returns the cost of the provided instruction using TTI. 520 // This does not handle loads and stores. 521 unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) { 522 switch (Opcode) { 523 default: break; 524 case Instruction::GetElementPtr: 525 // We mark this instruction as zero-cost because scalar GEPs are usually 526 // lowered to the intruction addressing mode. At the moment we don't 527 // generate vector GEPs. 528 return 0; 529 case Instruction::Br: 530 return TTI->getCFInstrCost(Opcode); 531 case Instruction::PHI: 532 return 0; 533 case Instruction::Add: 534 case Instruction::FAdd: 535 case Instruction::Sub: 536 case Instruction::FSub: 537 case Instruction::Mul: 538 case Instruction::FMul: 539 case Instruction::UDiv: 540 case Instruction::SDiv: 541 case Instruction::FDiv: 542 case Instruction::URem: 543 case Instruction::SRem: 544 case Instruction::FRem: 545 case Instruction::Shl: 546 case Instruction::LShr: 547 case Instruction::AShr: 548 case Instruction::And: 549 case Instruction::Or: 550 case Instruction::Xor: 551 return TTI->getArithmeticInstrCost(Opcode, T1); 552 case Instruction::Select: 553 case Instruction::ICmp: 554 case Instruction::FCmp: 555 return TTI->getCmpSelInstrCost(Opcode, T1, T2); 556 case Instruction::ZExt: 557 case Instruction::SExt: 558 case Instruction::FPToUI: 559 case Instruction::FPToSI: 560 case Instruction::FPExt: 561 case Instruction::PtrToInt: 562 case Instruction::IntToPtr: 563 case Instruction::SIToFP: 564 case Instruction::UIToFP: 565 case Instruction::Trunc: 566 case Instruction::FPTrunc: 567 case Instruction::BitCast: 568 case Instruction::ShuffleVector: 569 return TTI->getCastInstrCost(Opcode, T1, T2); 570 } 571 572 return 1; 573 } 574 575 // This determines the relative offset of two loads or stores, returning 576 // true if the offset could be determined to be some constant value. 577 // For example, if OffsetInElmts == 1, then J accesses the memory directly 578 // after I; if OffsetInElmts == -1 then I accesses the memory 579 // directly after J. 580 bool getPairPtrInfo(Instruction *I, Instruction *J, 581 Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, 582 unsigned &IAddressSpace, unsigned &JAddressSpace, 583 int64_t &OffsetInElmts, bool ComputeOffset = true) { 584 OffsetInElmts = 0; 585 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 586 LoadInst *LJ = cast<LoadInst>(J); 587 IPtr = LI->getPointerOperand(); 588 JPtr = LJ->getPointerOperand(); 589 IAlignment = LI->getAlignment(); 590 JAlignment = LJ->getAlignment(); 591 IAddressSpace = LI->getPointerAddressSpace(); 592 JAddressSpace = LJ->getPointerAddressSpace(); 593 } else { 594 StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J); 595 IPtr = SI->getPointerOperand(); 596 JPtr = SJ->getPointerOperand(); 597 IAlignment = SI->getAlignment(); 598 JAlignment = SJ->getAlignment(); 599 IAddressSpace = SI->getPointerAddressSpace(); 600 JAddressSpace = SJ->getPointerAddressSpace(); 601 } 602 603 if (!ComputeOffset) 604 return true; 605 606 const SCEV *IPtrSCEV = SE->getSCEV(IPtr); 607 const SCEV *JPtrSCEV = SE->getSCEV(JPtr); 608 609 // If this is a trivial offset, then we'll get something like 610 // 1*sizeof(type). With target data, which we need anyway, this will get 611 // constant folded into a number. 612 const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); 613 if (const SCEVConstant *ConstOffSCEV = 614 dyn_cast<SCEVConstant>(OffsetSCEV)) { 615 ConstantInt *IntOff = ConstOffSCEV->getValue(); 616 int64_t Offset = IntOff->getSExtValue(); 617 618 Type *VTy = cast<PointerType>(IPtr->getType())->getElementType(); 619 int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); 620 621 Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType(); 622 if (VTy != VTy2 && Offset < 0) { 623 int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2); 624 OffsetInElmts = Offset/VTy2TSS; 625 return (abs64(Offset) % VTy2TSS) == 0; 626 } 627 628 OffsetInElmts = Offset/VTyTSS; 629 return (abs64(Offset) % VTyTSS) == 0; 630 } 631 632 return false; 633 } 634 635 // Returns true if the provided CallInst represents an intrinsic that can 636 // be vectorized. 637 bool isVectorizableIntrinsic(CallInst* I) { 638 Function *F = I->getCalledFunction(); 639 if (!F) return false; 640 641 Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); 642 if (!IID) return false; 643 644 switch(IID) { 645 default: 646 return false; 647 case Intrinsic::sqrt: 648 case Intrinsic::powi: 649 case Intrinsic::sin: 650 case Intrinsic::cos: 651 case Intrinsic::log: 652 case Intrinsic::log2: 653 case Intrinsic::log10: 654 case Intrinsic::exp: 655 case Intrinsic::exp2: 656 case Intrinsic::pow: 657 return Config.VectorizeMath; 658 case Intrinsic::fma: 659 case Intrinsic::fmuladd: 660 return Config.VectorizeFMA; 661 } 662 } 663 664 // Returns true if J is the second element in some pair referenced by 665 // some multimap pair iterator pair. 666 template <typename V> 667 bool isSecondInIteratorPair(V J, std::pair< 668 typename std::multimap<V, V>::iterator, 669 typename std::multimap<V, V>::iterator> PairRange) { 670 for (typename std::multimap<V, V>::iterator K = PairRange.first; 671 K != PairRange.second; ++K) 672 if (K->second == J) return true; 673 674 return false; 675 } 676 677 bool isPureIEChain(InsertElementInst *IE) { 678 InsertElementInst *IENext = IE; 679 do { 680 if (!isa<UndefValue>(IENext->getOperand(0)) && 681 !isa<InsertElementInst>(IENext->getOperand(0))) { 682 return false; 683 } 684 } while ((IENext = 685 dyn_cast<InsertElementInst>(IENext->getOperand(0)))); 686 687 return true; 688 } 689 }; 690 691 // This function implements one vectorization iteration on the provided 692 // basic block. It returns true if the block is changed. 693 bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) { 694 bool ShouldContinue; 695 BasicBlock::iterator Start = BB.getFirstInsertionPt(); 696 697 std::vector<Value *> AllPairableInsts; 698 DenseMap<Value *, Value *> AllChosenPairs; 699 DenseSet<ValuePair> AllFixedOrderPairs; 700 DenseMap<VPPair, unsigned> AllPairConnectionTypes; 701 std::multimap<ValuePair, ValuePair> AllConnectedPairs, AllConnectedPairDeps; 702 703 do { 704 std::vector<Value *> PairableInsts; 705 std::multimap<Value *, Value *> CandidatePairs; 706 DenseSet<ValuePair> FixedOrderPairs; 707 DenseMap<ValuePair, int> CandidatePairCostSavings; 708 ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, 709 FixedOrderPairs, 710 CandidatePairCostSavings, 711 PairableInsts, NonPow2Len); 712 if (PairableInsts.empty()) continue; 713 714 // Now we have a map of all of the pairable instructions and we need to 715 // select the best possible pairing. A good pairing is one such that the 716 // users of the pair are also paired. This defines a (directed) forest 717 // over the pairs such that two pairs are connected iff the second pair 718 // uses the first. 719 720 // Note that it only matters that both members of the second pair use some 721 // element of the first pair (to allow for splatting). 722 723 std::multimap<ValuePair, ValuePair> ConnectedPairs, ConnectedPairDeps; 724 DenseMap<VPPair, unsigned> PairConnectionTypes; 725 computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs, 726 PairConnectionTypes); 727 if (ConnectedPairs.empty()) continue; 728 729 for (std::multimap<ValuePair, ValuePair>::iterator 730 I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); 731 I != IE; ++I) { 732 ConnectedPairDeps.insert(VPPair(I->second, I->first)); 733 } 734 735 // Build the pairable-instruction dependency map 736 DenseSet<ValuePair> PairableInstUsers; 737 buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); 738 739 // There is now a graph of the connected pairs. For each variable, pick 740 // the pairing with the largest tree meeting the depth requirement on at 741 // least one branch. Then select all pairings that are part of that tree 742 // and remove them from the list of available pairings and pairable 743 // variables. 744 745 DenseMap<Value *, Value *> ChosenPairs; 746 choosePairs(CandidatePairs, CandidatePairCostSavings, 747 PairableInsts, FixedOrderPairs, PairConnectionTypes, 748 ConnectedPairs, ConnectedPairDeps, 749 PairableInstUsers, ChosenPairs); 750 751 if (ChosenPairs.empty()) continue; 752 AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), 753 PairableInsts.end()); 754 AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); 755 756 // Only for the chosen pairs, propagate information on fixed-order pairs, 757 // pair connections, and their types to the data structures used by the 758 // pair fusion procedures. 759 for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(), 760 IE = ChosenPairs.end(); I != IE; ++I) { 761 if (FixedOrderPairs.count(*I)) 762 AllFixedOrderPairs.insert(*I); 763 else if (FixedOrderPairs.count(ValuePair(I->second, I->first))) 764 AllFixedOrderPairs.insert(ValuePair(I->second, I->first)); 765 766 for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin(); 767 J != IE; ++J) { 768 DenseMap<VPPair, unsigned>::iterator K = 769 PairConnectionTypes.find(VPPair(*I, *J)); 770 if (K != PairConnectionTypes.end()) { 771 AllPairConnectionTypes.insert(*K); 772 } else { 773 K = PairConnectionTypes.find(VPPair(*J, *I)); 774 if (K != PairConnectionTypes.end()) 775 AllPairConnectionTypes.insert(*K); 776 } 777 } 778 } 779 780 for (std::multimap<ValuePair, ValuePair>::iterator 781 I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); 782 I != IE; ++I) { 783 if (AllPairConnectionTypes.count(*I)) { 784 AllConnectedPairs.insert(*I); 785 AllConnectedPairDeps.insert(VPPair(I->second, I->first)); 786 } 787 } 788 } while (ShouldContinue); 789 790 if (AllChosenPairs.empty()) return false; 791 NumFusedOps += AllChosenPairs.size(); 792 793 // A set of pairs has now been selected. It is now necessary to replace the 794 // paired instructions with vector instructions. For this procedure each 795 // operand must be replaced with a vector operand. This vector is formed 796 // by using build_vector on the old operands. The replaced values are then 797 // replaced with a vector_extract on the result. Subsequent optimization 798 // passes should coalesce the build/extract combinations. 799 800 fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs, 801 AllPairConnectionTypes, 802 AllConnectedPairs, AllConnectedPairDeps); 803 804 // It is important to cleanup here so that future iterations of this 805 // function have less work to do. 806 (void) SimplifyInstructionsInBlock(&BB, TD, AA->getTargetLibraryInfo()); 807 return true; 808 } 809 810 // This function returns true if the provided instruction is capable of being 811 // fused into a vector instruction. This determination is based only on the 812 // type and other attributes of the instruction. 813 bool BBVectorize::isInstVectorizable(Instruction *I, 814 bool &IsSimpleLoadStore) { 815 IsSimpleLoadStore = false; 816 817 if (CallInst *C = dyn_cast<CallInst>(I)) { 818 if (!isVectorizableIntrinsic(C)) 819 return false; 820 } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { 821 // Vectorize simple loads if possbile: 822 IsSimpleLoadStore = L->isSimple(); 823 if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 824 return false; 825 } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { 826 // Vectorize simple stores if possbile: 827 IsSimpleLoadStore = S->isSimple(); 828 if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 829 return false; 830 } else if (CastInst *C = dyn_cast<CastInst>(I)) { 831 // We can vectorize casts, but not casts of pointer types, etc. 832 if (!Config.VectorizeCasts) 833 return false; 834 835 Type *SrcTy = C->getSrcTy(); 836 if (!SrcTy->isSingleValueType()) 837 return false; 838 839 Type *DestTy = C->getDestTy(); 840 if (!DestTy->isSingleValueType()) 841 return false; 842 } else if (isa<SelectInst>(I)) { 843 if (!Config.VectorizeSelect) 844 return false; 845 } else if (isa<CmpInst>(I)) { 846 if (!Config.VectorizeCmp) 847 return false; 848 } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { 849 if (!Config.VectorizeGEP) 850 return false; 851 852 // Currently, vector GEPs exist only with one index. 853 if (G->getNumIndices() != 1) 854 return false; 855 } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || 856 isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { 857 return false; 858 } 859 860 // We can't vectorize memory operations without target data 861 if (TD == 0 && IsSimpleLoadStore) 862 return false; 863 864 Type *T1, *T2; 865 getInstructionTypes(I, T1, T2); 866 867 // Not every type can be vectorized... 868 if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || 869 !(VectorType::isValidElementType(T2) || T2->isVectorTy())) 870 return false; 871 872 if (T1->getScalarSizeInBits() == 1) { 873 if (!Config.VectorizeBools) 874 return false; 875 } else { 876 if (!Config.VectorizeInts && T1->isIntOrIntVectorTy()) 877 return false; 878 } 879 880 if (T2->getScalarSizeInBits() == 1) { 881 if (!Config.VectorizeBools) 882 return false; 883 } else { 884 if (!Config.VectorizeInts && T2->isIntOrIntVectorTy()) 885 return false; 886 } 887 888 if (!Config.VectorizeFloats 889 && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) 890 return false; 891 892 // Don't vectorize target-specific types. 893 if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy()) 894 return false; 895 if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) 896 return false; 897 898 if ((!Config.VectorizePointers || TD == 0) && 899 (T1->getScalarType()->isPointerTy() || 900 T2->getScalarType()->isPointerTy())) 901 return false; 902 903 if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || 904 T2->getPrimitiveSizeInBits() >= Config.VectorBits)) 905 return false; 906 907 return true; 908 } 909 910 // This function returns true if the two provided instructions are compatible 911 // (meaning that they can be fused into a vector instruction). This assumes 912 // that I has already been determined to be vectorizable and that J is not 913 // in the use tree of I. 914 bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, 915 bool IsSimpleLoadStore, bool NonPow2Len, 916 int &CostSavings, int &FixedOrder) { 917 DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << 918 " <-> " << *J << "\n"); 919 920 CostSavings = 0; 921 FixedOrder = 0; 922 923 // Loads and stores can be merged if they have different alignments, 924 // but are otherwise the same. 925 if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | 926 (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0))) 927 return false; 928 929 Type *IT1, *IT2, *JT1, *JT2; 930 getInstructionTypes(I, IT1, IT2); 931 getInstructionTypes(J, JT1, JT2); 932 unsigned MaxTypeBits = std::max( 933 IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), 934 IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); 935 if (!TTI && MaxTypeBits > Config.VectorBits) 936 return false; 937 938 // FIXME: handle addsub-type operations! 939 940 if (IsSimpleLoadStore) { 941 Value *IPtr, *JPtr; 942 unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; 943 int64_t OffsetInElmts = 0; 944 if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 945 IAddressSpace, JAddressSpace, 946 OffsetInElmts) && abs64(OffsetInElmts) == 1) { 947 FixedOrder = (int) OffsetInElmts; 948 unsigned BottomAlignment = IAlignment; 949 if (OffsetInElmts < 0) BottomAlignment = JAlignment; 950 951 Type *aTypeI = isa<StoreInst>(I) ? 952 cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); 953 Type *aTypeJ = isa<StoreInst>(J) ? 954 cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); 955 Type *VType = getVecTypeForPair(aTypeI, aTypeJ); 956 957 if (Config.AlignedOnly) { 958 // An aligned load or store is possible only if the instruction 959 // with the lower offset has an alignment suitable for the 960 // vector type. 961 962 unsigned VecAlignment = TD->getPrefTypeAlignment(VType); 963 if (BottomAlignment < VecAlignment) 964 return false; 965 } 966 967 if (TTI) { 968 unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI, 969 IAlignment, IAddressSpace); 970 unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ, 971 JAlignment, JAddressSpace); 972 unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType, 973 BottomAlignment, 974 IAddressSpace); 975 if (VCost > ICost + JCost) 976 return false; 977 978 // We don't want to fuse to a type that will be split, even 979 // if the two input types will also be split and there is no other 980 // associated cost. 981 unsigned VParts = TTI->getNumberOfParts(VType); 982 if (VParts > 1) 983 return false; 984 else if (!VParts && VCost == ICost + JCost) 985 return false; 986 987 CostSavings = ICost + JCost - VCost; 988 } 989 } else { 990 return false; 991 } 992 } else if (TTI) { 993 unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2); 994 unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); 995 Type *VT1 = getVecTypeForPair(IT1, JT1), 996 *VT2 = getVecTypeForPair(IT2, JT2); 997 998 // Note that this procedure is incorrect for insert and extract element 999 // instructions (because combining these often results in a shuffle), 1000 // but this cost is ignored (because insert and extract element 1001 // instructions are assigned a zero depth factor and are not really 1002 // fused in general). 1003 unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2); 1004 1005 if (VCost > ICost + JCost) 1006 return false; 1007 1008 // We don't want to fuse to a type that will be split, even 1009 // if the two input types will also be split and there is no other 1010 // associated cost. 1011 unsigned VParts1 = TTI->getNumberOfParts(VT1), 1012 VParts2 = TTI->getNumberOfParts(VT2); 1013 if (VParts1 > 1 || VParts2 > 1) 1014 return false; 1015 else if ((!VParts1 || !VParts2) && VCost == ICost + JCost) 1016 return false; 1017 1018 CostSavings = ICost + JCost - VCost; 1019 } 1020 1021 // The powi intrinsic is special because only the first argument is 1022 // vectorized, the second arguments must be equal. 1023 CallInst *CI = dyn_cast<CallInst>(I); 1024 Function *FI; 1025 if (CI && (FI = CI->getCalledFunction())) { 1026 Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID(); 1027 if (IID == Intrinsic::powi) { 1028 Value *A1I = CI->getArgOperand(1), 1029 *A1J = cast<CallInst>(J)->getArgOperand(1); 1030 const SCEV *A1ISCEV = SE->getSCEV(A1I), 1031 *A1JSCEV = SE->getSCEV(A1J); 1032 return (A1ISCEV == A1JSCEV); 1033 } 1034 1035 if (IID && TTI) { 1036 SmallVector<Type*, 4> Tys; 1037 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) 1038 Tys.push_back(CI->getArgOperand(i)->getType()); 1039 unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys); 1040 1041 Tys.clear(); 1042 CallInst *CJ = cast<CallInst>(J); 1043 for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i) 1044 Tys.push_back(CJ->getArgOperand(i)->getType()); 1045 unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys); 1046 1047 Tys.clear(); 1048 assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && 1049 "Intrinsic argument counts differ"); 1050 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 1051 if (IID == Intrinsic::powi && i == 1) 1052 Tys.push_back(CI->getArgOperand(i)->getType()); 1053 else 1054 Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), 1055 CJ->getArgOperand(i)->getType())); 1056 } 1057 1058 Type *RetTy = getVecTypeForPair(IT1, JT1); 1059 unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys); 1060 1061 if (VCost > ICost + JCost) 1062 return false; 1063 1064 // We don't want to fuse to a type that will be split, even 1065 // if the two input types will also be split and there is no other 1066 // associated cost. 1067 unsigned RetParts = TTI->getNumberOfParts(RetTy); 1068 if (RetParts > 1) 1069 return false; 1070 else if (!RetParts && VCost == ICost + JCost) 1071 return false; 1072 1073 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 1074 if (!Tys[i]->isVectorTy()) 1075 continue; 1076 1077 unsigned NumParts = TTI->getNumberOfParts(Tys[i]); 1078 if (NumParts > 1) 1079 return false; 1080 else if (!NumParts && VCost == ICost + JCost) 1081 return false; 1082 } 1083 1084 CostSavings = ICost + JCost - VCost; 1085 } 1086 } 1087 1088 return true; 1089 } 1090 1091 // Figure out whether or not J uses I and update the users and write-set 1092 // structures associated with I. Specifically, Users represents the set of 1093 // instructions that depend on I. WriteSet represents the set 1094 // of memory locations that are dependent on I. If UpdateUsers is true, 1095 // and J uses I, then Users is updated to contain J and WriteSet is updated 1096 // to contain any memory locations to which J writes. The function returns 1097 // true if J uses I. By default, alias analysis is used to determine 1098 // whether J reads from memory that overlaps with a location in WriteSet. 1099 // If LoadMoveSet is not null, then it is a previously-computed multimap 1100 // where the key is the memory-based user instruction and the value is 1101 // the instruction to be compared with I. So, if LoadMoveSet is provided, 1102 // then the alias analysis is not used. This is necessary because this 1103 // function is called during the process of moving instructions during 1104 // vectorization and the results of the alias analysis are not stable during 1105 // that process. 1106 bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, 1107 AliasSetTracker &WriteSet, Instruction *I, 1108 Instruction *J, bool UpdateUsers, 1109 std::multimap<Value *, Value *> *LoadMoveSet) { 1110 bool UsesI = false; 1111 1112 // This instruction may already be marked as a user due, for example, to 1113 // being a member of a selected pair. 1114 if (Users.count(J)) 1115 UsesI = true; 1116 1117 if (!UsesI) 1118 for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); 1119 JU != JE; ++JU) { 1120 Value *V = *JU; 1121 if (I == V || Users.count(V)) { 1122 UsesI = true; 1123 break; 1124 } 1125 } 1126 if (!UsesI && J->mayReadFromMemory()) { 1127 if (LoadMoveSet) { 1128 VPIteratorPair JPairRange = LoadMoveSet->equal_range(J); 1129 UsesI = isSecondInIteratorPair<Value*>(I, JPairRange); 1130 } else { 1131 for (AliasSetTracker::iterator W = WriteSet.begin(), 1132 WE = WriteSet.end(); W != WE; ++W) { 1133 if (W->aliasesUnknownInst(J, *AA)) { 1134 UsesI = true; 1135 break; 1136 } 1137 } 1138 } 1139 } 1140 1141 if (UsesI && UpdateUsers) { 1142 if (J->mayWriteToMemory()) WriteSet.add(J); 1143 Users.insert(J); 1144 } 1145 1146 return UsesI; 1147 } 1148 1149 // This function iterates over all instruction pairs in the provided 1150 // basic block and collects all candidate pairs for vectorization. 1151 bool BBVectorize::getCandidatePairs(BasicBlock &BB, 1152 BasicBlock::iterator &Start, 1153 std::multimap<Value *, Value *> &CandidatePairs, 1154 DenseSet<ValuePair> &FixedOrderPairs, 1155 DenseMap<ValuePair, int> &CandidatePairCostSavings, 1156 std::vector<Value *> &PairableInsts, bool NonPow2Len) { 1157 BasicBlock::iterator E = BB.end(); 1158 if (Start == E) return false; 1159 1160 bool ShouldContinue = false, IAfterStart = false; 1161 for (BasicBlock::iterator I = Start++; I != E; ++I) { 1162 if (I == Start) IAfterStart = true; 1163 1164 bool IsSimpleLoadStore; 1165 if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; 1166 1167 // Look for an instruction with which to pair instruction *I... 1168 DenseSet<Value *> Users; 1169 AliasSetTracker WriteSet(*AA); 1170 bool JAfterStart = IAfterStart; 1171 BasicBlock::iterator J = llvm::next(I); 1172 for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { 1173 if (J == Start) JAfterStart = true; 1174 1175 // Determine if J uses I, if so, exit the loop. 1176 bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep); 1177 if (Config.FastDep) { 1178 // Note: For this heuristic to be effective, independent operations 1179 // must tend to be intermixed. This is likely to be true from some 1180 // kinds of grouped loop unrolling (but not the generic LLVM pass), 1181 // but otherwise may require some kind of reordering pass. 1182 1183 // When using fast dependency analysis, 1184 // stop searching after first use: 1185 if (UsesI) break; 1186 } else { 1187 if (UsesI) continue; 1188 } 1189 1190 // J does not use I, and comes before the first use of I, so it can be 1191 // merged with I if the instructions are compatible. 1192 int CostSavings, FixedOrder; 1193 if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len, 1194 CostSavings, FixedOrder)) continue; 1195 1196 // J is a candidate for merging with I. 1197 if (!PairableInsts.size() || 1198 PairableInsts[PairableInsts.size()-1] != I) { 1199 PairableInsts.push_back(I); 1200 } 1201 1202 CandidatePairs.insert(ValuePair(I, J)); 1203 if (TTI) 1204 CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), 1205 CostSavings)); 1206 1207 if (FixedOrder == 1) 1208 FixedOrderPairs.insert(ValuePair(I, J)); 1209 else if (FixedOrder == -1) 1210 FixedOrderPairs.insert(ValuePair(J, I)); 1211 1212 // The next call to this function must start after the last instruction 1213 // selected during this invocation. 1214 if (JAfterStart) { 1215 Start = llvm::next(J); 1216 IAfterStart = JAfterStart = false; 1217 } 1218 1219 DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " 1220 << *I << " <-> " << *J << " (cost savings: " << 1221 CostSavings << ")\n"); 1222 1223 // If we have already found too many pairs, break here and this function 1224 // will be called again starting after the last instruction selected 1225 // during this invocation. 1226 if (PairableInsts.size() >= Config.MaxInsts) { 1227 ShouldContinue = true; 1228 break; 1229 } 1230 } 1231 1232 if (ShouldContinue) 1233 break; 1234 } 1235 1236 DEBUG(dbgs() << "BBV: found " << PairableInsts.size() 1237 << " instructions with candidate pairs\n"); 1238 1239 return ShouldContinue; 1240 } 1241 1242 // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that 1243 // it looks for pairs such that both members have an input which is an 1244 // output of PI or PJ. 1245 void BBVectorize::computePairsConnectedTo( 1246 std::multimap<Value *, Value *> &CandidatePairs, 1247 std::vector<Value *> &PairableInsts, 1248 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1249 DenseMap<VPPair, unsigned> &PairConnectionTypes, 1250 ValuePair P) { 1251 StoreInst *SI, *SJ; 1252 1253 // For each possible pairing for this variable, look at the uses of 1254 // the first value... 1255 for (Value::use_iterator I = P.first->use_begin(), 1256 E = P.first->use_end(); I != E; ++I) { 1257 if (isa<LoadInst>(*I)) { 1258 // A pair cannot be connected to a load because the load only takes one 1259 // operand (the address) and it is a scalar even after vectorization. 1260 continue; 1261 } else if ((SI = dyn_cast<StoreInst>(*I)) && 1262 P.first == SI->getPointerOperand()) { 1263 // Similarly, a pair cannot be connected to a store through its 1264 // pointer operand. 1265 continue; 1266 } 1267 1268 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); 1269 1270 // For each use of the first variable, look for uses of the second 1271 // variable... 1272 for (Value::use_iterator J = P.second->use_begin(), 1273 E2 = P.second->use_end(); J != E2; ++J) { 1274 if ((SJ = dyn_cast<StoreInst>(*J)) && 1275 P.second == SJ->getPointerOperand()) 1276 continue; 1277 1278 VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); 1279 1280 // Look for <I, J>: 1281 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { 1282 VPPair VP(P, ValuePair(*I, *J)); 1283 ConnectedPairs.insert(VP); 1284 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); 1285 } 1286 1287 // Look for <J, I>: 1288 if (isSecondInIteratorPair<Value*>(*I, JPairRange)) { 1289 VPPair VP(P, ValuePair(*J, *I)); 1290 ConnectedPairs.insert(VP); 1291 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); 1292 } 1293 } 1294 1295 if (Config.SplatBreaksChain) continue; 1296 // Look for cases where just the first value in the pair is used by 1297 // both members of another pair (splatting). 1298 for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) { 1299 if ((SJ = dyn_cast<StoreInst>(*J)) && 1300 P.first == SJ->getPointerOperand()) 1301 continue; 1302 1303 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { 1304 VPPair VP(P, ValuePair(*I, *J)); 1305 ConnectedPairs.insert(VP); 1306 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); 1307 } 1308 } 1309 } 1310 1311 if (Config.SplatBreaksChain) return; 1312 // Look for cases where just the second value in the pair is used by 1313 // both members of another pair (splatting). 1314 for (Value::use_iterator I = P.second->use_begin(), 1315 E = P.second->use_end(); I != E; ++I) { 1316 if (isa<LoadInst>(*I)) 1317 continue; 1318 else if ((SI = dyn_cast<StoreInst>(*I)) && 1319 P.second == SI->getPointerOperand()) 1320 continue; 1321 1322 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); 1323 1324 for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { 1325 if ((SJ = dyn_cast<StoreInst>(*J)) && 1326 P.second == SJ->getPointerOperand()) 1327 continue; 1328 1329 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { 1330 VPPair VP(P, ValuePair(*I, *J)); 1331 ConnectedPairs.insert(VP); 1332 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); 1333 } 1334 } 1335 } 1336 } 1337 1338 // This function figures out which pairs are connected. Two pairs are 1339 // connected if some output of the first pair forms an input to both members 1340 // of the second pair. 1341 void BBVectorize::computeConnectedPairs( 1342 std::multimap<Value *, Value *> &CandidatePairs, 1343 std::vector<Value *> &PairableInsts, 1344 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1345 DenseMap<VPPair, unsigned> &PairConnectionTypes) { 1346 1347 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 1348 PE = PairableInsts.end(); PI != PE; ++PI) { 1349 VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI); 1350 1351 for (std::multimap<Value *, Value *>::iterator P = choiceRange.first; 1352 P != choiceRange.second; ++P) 1353 computePairsConnectedTo(CandidatePairs, PairableInsts, 1354 ConnectedPairs, PairConnectionTypes, *P); 1355 } 1356 1357 DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size() 1358 << " pair connections.\n"); 1359 } 1360 1361 // This function builds a set of use tuples such that <A, B> is in the set 1362 // if B is in the use tree of A. If B is in the use tree of A, then B 1363 // depends on the output of A. 1364 void BBVectorize::buildDepMap( 1365 BasicBlock &BB, 1366 std::multimap<Value *, Value *> &CandidatePairs, 1367 std::vector<Value *> &PairableInsts, 1368 DenseSet<ValuePair> &PairableInstUsers) { 1369 DenseSet<Value *> IsInPair; 1370 for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(), 1371 E = CandidatePairs.end(); C != E; ++C) { 1372 IsInPair.insert(C->first); 1373 IsInPair.insert(C->second); 1374 } 1375 1376 // Iterate through the basic block, recording all users of each 1377 // pairable instruction. 1378 1379 BasicBlock::iterator E = BB.end(); 1380 for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { 1381 if (IsInPair.find(I) == IsInPair.end()) continue; 1382 1383 DenseSet<Value *> Users; 1384 AliasSetTracker WriteSet(*AA); 1385 for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) 1386 (void) trackUsesOfI(Users, WriteSet, I, J); 1387 1388 for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); 1389 U != E; ++U) 1390 PairableInstUsers.insert(ValuePair(I, *U)); 1391 } 1392 } 1393 1394 // Returns true if an input to pair P is an output of pair Q and also an 1395 // input of pair Q is an output of pair P. If this is the case, then these 1396 // two pairs cannot be simultaneously fused. 1397 bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, 1398 DenseSet<ValuePair> &PairableInstUsers, 1399 std::multimap<ValuePair, ValuePair> *PairableInstUserMap) { 1400 // Two pairs are in conflict if they are mutual Users of eachother. 1401 bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || 1402 PairableInstUsers.count(ValuePair(P.first, Q.second)) || 1403 PairableInstUsers.count(ValuePair(P.second, Q.first)) || 1404 PairableInstUsers.count(ValuePair(P.second, Q.second)); 1405 bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || 1406 PairableInstUsers.count(ValuePair(Q.first, P.second)) || 1407 PairableInstUsers.count(ValuePair(Q.second, P.first)) || 1408 PairableInstUsers.count(ValuePair(Q.second, P.second)); 1409 if (PairableInstUserMap) { 1410 // FIXME: The expensive part of the cycle check is not so much the cycle 1411 // check itself but this edge insertion procedure. This needs some 1412 // profiling and probably a different data structure (same is true of 1413 // most uses of std::multimap). 1414 if (PUsesQ) { 1415 VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q); 1416 if (!isSecondInIteratorPair(P, QPairRange)) 1417 PairableInstUserMap->insert(VPPair(Q, P)); 1418 } 1419 if (QUsesP) { 1420 VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P); 1421 if (!isSecondInIteratorPair(Q, PPairRange)) 1422 PairableInstUserMap->insert(VPPair(P, Q)); 1423 } 1424 } 1425 1426 return (QUsesP && PUsesQ); 1427 } 1428 1429 // This function walks the use graph of current pairs to see if, starting 1430 // from P, the walk returns to P. 1431 bool BBVectorize::pairWillFormCycle(ValuePair P, 1432 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 1433 DenseSet<ValuePair> &CurrentPairs) { 1434 DEBUG(if (DebugCycleCheck) 1435 dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " 1436 << *P.second << "\n"); 1437 // A lookup table of visisted pairs is kept because the PairableInstUserMap 1438 // contains non-direct associations. 1439 DenseSet<ValuePair> Visited; 1440 SmallVector<ValuePair, 32> Q; 1441 // General depth-first post-order traversal: 1442 Q.push_back(P); 1443 do { 1444 ValuePair QTop = Q.pop_back_val(); 1445 Visited.insert(QTop); 1446 1447 DEBUG(if (DebugCycleCheck) 1448 dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " 1449 << *QTop.second << "\n"); 1450 VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop); 1451 for (std::multimap<ValuePair, ValuePair>::iterator C = QPairRange.first; 1452 C != QPairRange.second; ++C) { 1453 if (C->second == P) { 1454 DEBUG(dbgs() 1455 << "BBV: rejected to prevent non-trivial cycle formation: " 1456 << *C->first.first << " <-> " << *C->first.second << "\n"); 1457 return true; 1458 } 1459 1460 if (CurrentPairs.count(C->second) && !Visited.count(C->second)) 1461 Q.push_back(C->second); 1462 } 1463 } while (!Q.empty()); 1464 1465 return false; 1466 } 1467 1468 // This function builds the initial tree of connected pairs with the 1469 // pair J at the root. 1470 void BBVectorize::buildInitialTreeFor( 1471 std::multimap<Value *, Value *> &CandidatePairs, 1472 std::vector<Value *> &PairableInsts, 1473 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1474 DenseSet<ValuePair> &PairableInstUsers, 1475 DenseMap<Value *, Value *> &ChosenPairs, 1476 DenseMap<ValuePair, size_t> &Tree, ValuePair J) { 1477 // Each of these pairs is viewed as the root node of a Tree. The Tree 1478 // is then walked (depth-first). As this happens, we keep track of 1479 // the pairs that compose the Tree and the maximum depth of the Tree. 1480 SmallVector<ValuePairWithDepth, 32> Q; 1481 // General depth-first post-order traversal: 1482 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1483 do { 1484 ValuePairWithDepth QTop = Q.back(); 1485 1486 // Push each child onto the queue: 1487 bool MoreChildren = false; 1488 size_t MaxChildDepth = QTop.second; 1489 VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first); 1490 for (std::multimap<ValuePair, ValuePair>::iterator k = qtRange.first; 1491 k != qtRange.second; ++k) { 1492 // Make sure that this child pair is still a candidate: 1493 bool IsStillCand = false; 1494 VPIteratorPair checkRange = 1495 CandidatePairs.equal_range(k->second.first); 1496 for (std::multimap<Value *, Value *>::iterator m = checkRange.first; 1497 m != checkRange.second; ++m) { 1498 if (m->second == k->second.second) { 1499 IsStillCand = true; 1500 break; 1501 } 1502 } 1503 1504 if (IsStillCand) { 1505 DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second); 1506 if (C == Tree.end()) { 1507 size_t d = getDepthFactor(k->second.first); 1508 Q.push_back(ValuePairWithDepth(k->second, QTop.second+d)); 1509 MoreChildren = true; 1510 } else { 1511 MaxChildDepth = std::max(MaxChildDepth, C->second); 1512 } 1513 } 1514 } 1515 1516 if (!MoreChildren) { 1517 // Record the current pair as part of the Tree: 1518 Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); 1519 Q.pop_back(); 1520 } 1521 } while (!Q.empty()); 1522 } 1523 1524 // Given some initial tree, prune it by removing conflicting pairs (pairs 1525 // that cannot be simultaneously chosen for vectorization). 1526 void BBVectorize::pruneTreeFor( 1527 std::multimap<Value *, Value *> &CandidatePairs, 1528 std::vector<Value *> &PairableInsts, 1529 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1530 DenseSet<ValuePair> &PairableInstUsers, 1531 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 1532 DenseMap<Value *, Value *> &ChosenPairs, 1533 DenseMap<ValuePair, size_t> &Tree, 1534 DenseSet<ValuePair> &PrunedTree, ValuePair J, 1535 bool UseCycleCheck) { 1536 SmallVector<ValuePairWithDepth, 32> Q; 1537 // General depth-first post-order traversal: 1538 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1539 do { 1540 ValuePairWithDepth QTop = Q.pop_back_val(); 1541 PrunedTree.insert(QTop.first); 1542 1543 // Visit each child, pruning as necessary... 1544 SmallVector<ValuePairWithDepth, 8> BestChildren; 1545 VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first); 1546 for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first; 1547 K != QTopRange.second; ++K) { 1548 DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second); 1549 if (C == Tree.end()) continue; 1550 1551 // This child is in the Tree, now we need to make sure it is the 1552 // best of any conflicting children. There could be multiple 1553 // conflicting children, so first, determine if we're keeping 1554 // this child, then delete conflicting children as necessary. 1555 1556 // It is also necessary to guard against pairing-induced 1557 // dependencies. Consider instructions a .. x .. y .. b 1558 // such that (a,b) are to be fused and (x,y) are to be fused 1559 // but a is an input to x and b is an output from y. This 1560 // means that y cannot be moved after b but x must be moved 1561 // after b for (a,b) to be fused. In other words, after 1562 // fusing (a,b) we have y .. a/b .. x where y is an input 1563 // to a/b and x is an output to a/b: x and y can no longer 1564 // be legally fused. To prevent this condition, we must 1565 // make sure that a child pair added to the Tree is not 1566 // both an input and output of an already-selected pair. 1567 1568 // Pairing-induced dependencies can also form from more complicated 1569 // cycles. The pair vs. pair conflicts are easy to check, and so 1570 // that is done explicitly for "fast rejection", and because for 1571 // child vs. child conflicts, we may prefer to keep the current 1572 // pair in preference to the already-selected child. 1573 DenseSet<ValuePair> CurrentPairs; 1574 1575 bool CanAdd = true; 1576 for (SmallVector<ValuePairWithDepth, 8>::iterator C2 1577 = BestChildren.begin(), E2 = BestChildren.end(); 1578 C2 != E2; ++C2) { 1579 if (C2->first.first == C->first.first || 1580 C2->first.first == C->first.second || 1581 C2->first.second == C->first.first || 1582 C2->first.second == C->first.second || 1583 pairsConflict(C2->first, C->first, PairableInstUsers, 1584 UseCycleCheck ? &PairableInstUserMap : 0)) { 1585 if (C2->second >= C->second) { 1586 CanAdd = false; 1587 break; 1588 } 1589 1590 CurrentPairs.insert(C2->first); 1591 } 1592 } 1593 if (!CanAdd) continue; 1594 1595 // Even worse, this child could conflict with another node already 1596 // selected for the Tree. If that is the case, ignore this child. 1597 for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(), 1598 E2 = PrunedTree.end(); T != E2; ++T) { 1599 if (T->first == C->first.first || 1600 T->first == C->first.second || 1601 T->second == C->first.first || 1602 T->second == C->first.second || 1603 pairsConflict(*T, C->first, PairableInstUsers, 1604 UseCycleCheck ? &PairableInstUserMap : 0)) { 1605 CanAdd = false; 1606 break; 1607 } 1608 1609 CurrentPairs.insert(*T); 1610 } 1611 if (!CanAdd) continue; 1612 1613 // And check the queue too... 1614 for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(), 1615 E2 = Q.end(); C2 != E2; ++C2) { 1616 if (C2->first.first == C->first.first || 1617 C2->first.first == C->first.second || 1618 C2->first.second == C->first.first || 1619 C2->first.second == C->first.second || 1620 pairsConflict(C2->first, C->first, PairableInstUsers, 1621 UseCycleCheck ? &PairableInstUserMap : 0)) { 1622 CanAdd = false; 1623 break; 1624 } 1625 1626 CurrentPairs.insert(C2->first); 1627 } 1628 if (!CanAdd) continue; 1629 1630 // Last but not least, check for a conflict with any of the 1631 // already-chosen pairs. 1632 for (DenseMap<Value *, Value *>::iterator C2 = 1633 ChosenPairs.begin(), E2 = ChosenPairs.end(); 1634 C2 != E2; ++C2) { 1635 if (pairsConflict(*C2, C->first, PairableInstUsers, 1636 UseCycleCheck ? &PairableInstUserMap : 0)) { 1637 CanAdd = false; 1638 break; 1639 } 1640 1641 CurrentPairs.insert(*C2); 1642 } 1643 if (!CanAdd) continue; 1644 1645 // To check for non-trivial cycles formed by the addition of the 1646 // current pair we've formed a list of all relevant pairs, now use a 1647 // graph walk to check for a cycle. We start from the current pair and 1648 // walk the use tree to see if we again reach the current pair. If we 1649 // do, then the current pair is rejected. 1650 1651 // FIXME: It may be more efficient to use a topological-ordering 1652 // algorithm to improve the cycle check. This should be investigated. 1653 if (UseCycleCheck && 1654 pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) 1655 continue; 1656 1657 // This child can be added, but we may have chosen it in preference 1658 // to an already-selected child. Check for this here, and if a 1659 // conflict is found, then remove the previously-selected child 1660 // before adding this one in its place. 1661 for (SmallVector<ValuePairWithDepth, 8>::iterator C2 1662 = BestChildren.begin(); C2 != BestChildren.end();) { 1663 if (C2->first.first == C->first.first || 1664 C2->first.first == C->first.second || 1665 C2->first.second == C->first.first || 1666 C2->first.second == C->first.second || 1667 pairsConflict(C2->first, C->first, PairableInstUsers)) 1668 C2 = BestChildren.erase(C2); 1669 else 1670 ++C2; 1671 } 1672 1673 BestChildren.push_back(ValuePairWithDepth(C->first, C->second)); 1674 } 1675 1676 for (SmallVector<ValuePairWithDepth, 8>::iterator C 1677 = BestChildren.begin(), E2 = BestChildren.end(); 1678 C != E2; ++C) { 1679 size_t DepthF = getDepthFactor(C->first.first); 1680 Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); 1681 } 1682 } while (!Q.empty()); 1683 } 1684 1685 // This function finds the best tree of mututally-compatible connected 1686 // pairs, given the choice of root pairs as an iterator range. 1687 void BBVectorize::findBestTreeFor( 1688 std::multimap<Value *, Value *> &CandidatePairs, 1689 DenseMap<ValuePair, int> &CandidatePairCostSavings, 1690 std::vector<Value *> &PairableInsts, 1691 DenseSet<ValuePair> &FixedOrderPairs, 1692 DenseMap<VPPair, unsigned> &PairConnectionTypes, 1693 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1694 std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, 1695 DenseSet<ValuePair> &PairableInstUsers, 1696 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 1697 DenseMap<Value *, Value *> &ChosenPairs, 1698 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, 1699 int &BestEffSize, VPIteratorPair ChoiceRange, 1700 bool UseCycleCheck) { 1701 for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first; 1702 J != ChoiceRange.second; ++J) { 1703 1704 // Before going any further, make sure that this pair does not 1705 // conflict with any already-selected pairs (see comment below 1706 // near the Tree pruning for more details). 1707 DenseSet<ValuePair> ChosenPairSet; 1708 bool DoesConflict = false; 1709 for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), 1710 E = ChosenPairs.end(); C != E; ++C) { 1711 if (pairsConflict(*C, *J, PairableInstUsers, 1712 UseCycleCheck ? &PairableInstUserMap : 0)) { 1713 DoesConflict = true; 1714 break; 1715 } 1716 1717 ChosenPairSet.insert(*C); 1718 } 1719 if (DoesConflict) continue; 1720 1721 if (UseCycleCheck && 1722 pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet)) 1723 continue; 1724 1725 DenseMap<ValuePair, size_t> Tree; 1726 buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1727 PairableInstUsers, ChosenPairs, Tree, *J); 1728 1729 // Because we'll keep the child with the largest depth, the largest 1730 // depth is still the same in the unpruned Tree. 1731 size_t MaxDepth = Tree.lookup(*J); 1732 1733 DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {" 1734 << *J->first << " <-> " << *J->second << "} of depth " << 1735 MaxDepth << " and size " << Tree.size() << "\n"); 1736 1737 // At this point the Tree has been constructed, but, may contain 1738 // contradictory children (meaning that different children of 1739 // some tree node may be attempting to fuse the same instruction). 1740 // So now we walk the tree again, in the case of a conflict, 1741 // keep only the child with the largest depth. To break a tie, 1742 // favor the first child. 1743 1744 DenseSet<ValuePair> PrunedTree; 1745 pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1746 PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree, 1747 PrunedTree, *J, UseCycleCheck); 1748 1749 int EffSize = 0; 1750 if (TTI) { 1751 DenseSet<Value *> PrunedTreeInstrs; 1752 for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), 1753 E = PrunedTree.end(); S != E; ++S) { 1754 PrunedTreeInstrs.insert(S->first); 1755 PrunedTreeInstrs.insert(S->second); 1756 } 1757 1758 // The set of pairs that have already contributed to the total cost. 1759 DenseSet<ValuePair> IncomingPairs; 1760 1761 // If the cost model were perfect, this might not be necessary; but we 1762 // need to make sure that we don't get stuck vectorizing our own 1763 // shuffle chains. 1764 bool HasNontrivialInsts = false; 1765 1766 // The node weights represent the cost savings associated with 1767 // fusing the pair of instructions. 1768 for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), 1769 E = PrunedTree.end(); S != E; ++S) { 1770 if (!isa<ShuffleVectorInst>(S->first) && 1771 !isa<InsertElementInst>(S->first) && 1772 !isa<ExtractElementInst>(S->first)) 1773 HasNontrivialInsts = true; 1774 1775 bool FlipOrder = false; 1776 1777 if (getDepthFactor(S->first)) { 1778 int ESContrib = CandidatePairCostSavings.find(*S)->second; 1779 DEBUG(if (DebugPairSelection) dbgs() << "\tweight {" 1780 << *S->first << " <-> " << *S->second << "} = " << 1781 ESContrib << "\n"); 1782 EffSize += ESContrib; 1783 } 1784 1785 // The edge weights contribute in a negative sense: they represent 1786 // the cost of shuffles. 1787 VPPIteratorPair IP = ConnectedPairDeps.equal_range(*S); 1788 if (IP.first != ConnectedPairDeps.end()) { 1789 unsigned NumDepsDirect = 0, NumDepsSwap = 0; 1790 for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; 1791 Q != IP.second; ++Q) { 1792 if (!PrunedTree.count(Q->second)) 1793 continue; 1794 DenseMap<VPPair, unsigned>::iterator R = 1795 PairConnectionTypes.find(VPPair(Q->second, Q->first)); 1796 assert(R != PairConnectionTypes.end() && 1797 "Cannot find pair connection type"); 1798 if (R->second == PairConnectionDirect) 1799 ++NumDepsDirect; 1800 else if (R->second == PairConnectionSwap) 1801 ++NumDepsSwap; 1802 } 1803 1804 // If there are more swaps than direct connections, then 1805 // the pair order will be flipped during fusion. So the real 1806 // number of swaps is the minimum number. 1807 FlipOrder = !FixedOrderPairs.count(*S) && 1808 ((NumDepsSwap > NumDepsDirect) || 1809 FixedOrderPairs.count(ValuePair(S->second, S->first))); 1810 1811 for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; 1812 Q != IP.second; ++Q) { 1813 if (!PrunedTree.count(Q->second)) 1814 continue; 1815 DenseMap<VPPair, unsigned>::iterator R = 1816 PairConnectionTypes.find(VPPair(Q->second, Q->first)); 1817 assert(R != PairConnectionTypes.end() && 1818 "Cannot find pair connection type"); 1819 Type *Ty1 = Q->second.first->getType(), 1820 *Ty2 = Q->second.second->getType(); 1821 Type *VTy = getVecTypeForPair(Ty1, Ty2); 1822 if ((R->second == PairConnectionDirect && FlipOrder) || 1823 (R->second == PairConnectionSwap && !FlipOrder) || 1824 R->second == PairConnectionSplat) { 1825 int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 1826 VTy, VTy); 1827 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 1828 *Q->second.first << " <-> " << *Q->second.second << 1829 "} -> {" << 1830 *S->first << " <-> " << *S->second << "} = " << 1831 ESContrib << "\n"); 1832 EffSize -= ESContrib; 1833 } 1834 } 1835 } 1836 1837 // Compute the cost of outgoing edges. We assume that edges outgoing 1838 // to shuffles, inserts or extracts can be merged, and so contribute 1839 // no additional cost. 1840 if (!S->first->getType()->isVoidTy()) { 1841 Type *Ty1 = S->first->getType(), 1842 *Ty2 = S->second->getType(); 1843 Type *VTy = getVecTypeForPair(Ty1, Ty2); 1844 1845 bool NeedsExtraction = false; 1846 for (Value::use_iterator I = S->first->use_begin(), 1847 IE = S->first->use_end(); I != IE; ++I) { 1848 if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) { 1849 // Shuffle can be folded if it has no other input 1850 if (isa<UndefValue>(SI->getOperand(1))) 1851 continue; 1852 } 1853 if (isa<ExtractElementInst>(*I)) 1854 continue; 1855 if (PrunedTreeInstrs.count(*I)) 1856 continue; 1857 NeedsExtraction = true; 1858 break; 1859 } 1860 1861 if (NeedsExtraction) { 1862 int ESContrib; 1863 if (Ty1->isVectorTy()) 1864 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 1865 Ty1, VTy); 1866 else 1867 ESContrib = (int) TTI->getVectorInstrCost( 1868 Instruction::ExtractElement, VTy, 0); 1869 1870 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 1871 *S->first << "} = " << ESContrib << "\n"); 1872 EffSize -= ESContrib; 1873 } 1874 1875 NeedsExtraction = false; 1876 for (Value::use_iterator I = S->second->use_begin(), 1877 IE = S->second->use_end(); I != IE; ++I) { 1878 if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) { 1879 // Shuffle can be folded if it has no other input 1880 if (isa<UndefValue>(SI->getOperand(1))) 1881 continue; 1882 } 1883 if (isa<ExtractElementInst>(*I)) 1884 continue; 1885 if (PrunedTreeInstrs.count(*I)) 1886 continue; 1887 NeedsExtraction = true; 1888 break; 1889 } 1890 1891 if (NeedsExtraction) { 1892 int ESContrib; 1893 if (Ty2->isVectorTy()) 1894 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 1895 Ty2, VTy); 1896 else 1897 ESContrib = (int) TTI->getVectorInstrCost( 1898 Instruction::ExtractElement, VTy, 1); 1899 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 1900 *S->second << "} = " << ESContrib << "\n"); 1901 EffSize -= ESContrib; 1902 } 1903 } 1904 1905 // Compute the cost of incoming edges. 1906 if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) { 1907 Instruction *S1 = cast<Instruction>(S->first), 1908 *S2 = cast<Instruction>(S->second); 1909 for (unsigned o = 0; o < S1->getNumOperands(); ++o) { 1910 Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); 1911 1912 // Combining constants into vector constants (or small vector 1913 // constants into larger ones are assumed free). 1914 if (isa<Constant>(O1) && isa<Constant>(O2)) 1915 continue; 1916 1917 if (FlipOrder) 1918 std::swap(O1, O2); 1919 1920 ValuePair VP = ValuePair(O1, O2); 1921 ValuePair VPR = ValuePair(O2, O1); 1922 1923 // Internal edges are not handled here. 1924 if (PrunedTree.count(VP) || PrunedTree.count(VPR)) 1925 continue; 1926 1927 Type *Ty1 = O1->getType(), 1928 *Ty2 = O2->getType(); 1929 Type *VTy = getVecTypeForPair(Ty1, Ty2); 1930 1931 // Combining vector operations of the same type is also assumed 1932 // folded with other operations. 1933 if (Ty1 == Ty2) { 1934 // If both are insert elements, then both can be widened. 1935 InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1), 1936 *IEO2 = dyn_cast<InsertElementInst>(O2); 1937 if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2)) 1938 continue; 1939 // If both are extract elements, and both have the same input 1940 // type, then they can be replaced with a shuffle 1941 ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1), 1942 *EIO2 = dyn_cast<ExtractElementInst>(O2); 1943 if (EIO1 && EIO2 && 1944 EIO1->getOperand(0)->getType() == 1945 EIO2->getOperand(0)->getType()) 1946 continue; 1947 // If both are a shuffle with equal operand types and only two 1948 // unqiue operands, then they can be replaced with a single 1949 // shuffle 1950 ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1), 1951 *SIO2 = dyn_cast<ShuffleVectorInst>(O2); 1952 if (SIO1 && SIO2 && 1953 SIO1->getOperand(0)->getType() == 1954 SIO2->getOperand(0)->getType()) { 1955 SmallSet<Value *, 4> SIOps; 1956 SIOps.insert(SIO1->getOperand(0)); 1957 SIOps.insert(SIO1->getOperand(1)); 1958 SIOps.insert(SIO2->getOperand(0)); 1959 SIOps.insert(SIO2->getOperand(1)); 1960 if (SIOps.size() <= 2) 1961 continue; 1962 } 1963 } 1964 1965 int ESContrib; 1966 // This pair has already been formed. 1967 if (IncomingPairs.count(VP)) { 1968 continue; 1969 } else if (IncomingPairs.count(VPR)) { 1970 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 1971 VTy, VTy); 1972 } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { 1973 ESContrib = (int) TTI->getVectorInstrCost( 1974 Instruction::InsertElement, VTy, 0); 1975 ESContrib += (int) TTI->getVectorInstrCost( 1976 Instruction::InsertElement, VTy, 1); 1977 } else if (!Ty1->isVectorTy()) { 1978 // O1 needs to be inserted into a vector of size O2, and then 1979 // both need to be shuffled together. 1980 ESContrib = (int) TTI->getVectorInstrCost( 1981 Instruction::InsertElement, Ty2, 0); 1982 ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 1983 VTy, Ty2); 1984 } else if (!Ty2->isVectorTy()) { 1985 // O2 needs to be inserted into a vector of size O1, and then 1986 // both need to be shuffled together. 1987 ESContrib = (int) TTI->getVectorInstrCost( 1988 Instruction::InsertElement, Ty1, 0); 1989 ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 1990 VTy, Ty1); 1991 } else { 1992 Type *TyBig = Ty1, *TySmall = Ty2; 1993 if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) 1994 std::swap(TyBig, TySmall); 1995 1996 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 1997 VTy, TyBig); 1998 if (TyBig != TySmall) 1999 ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 2000 TyBig, TySmall); 2001 } 2002 2003 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" 2004 << *O1 << " <-> " << *O2 << "} = " << 2005 ESContrib << "\n"); 2006 EffSize -= ESContrib; 2007 IncomingPairs.insert(VP); 2008 } 2009 } 2010 } 2011 2012 if (!HasNontrivialInsts) { 2013 DEBUG(if (DebugPairSelection) dbgs() << 2014 "\tNo non-trivial instructions in tree;" 2015 " override to zero effective size\n"); 2016 EffSize = 0; 2017 } 2018 } else { 2019 for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), 2020 E = PrunedTree.end(); S != E; ++S) 2021 EffSize += (int) getDepthFactor(S->first); 2022 } 2023 2024 DEBUG(if (DebugPairSelection) 2025 dbgs() << "BBV: found pruned Tree for pair {" 2026 << *J->first << " <-> " << *J->second << "} of depth " << 2027 MaxDepth << " and size " << PrunedTree.size() << 2028 " (effective size: " << EffSize << ")\n"); 2029 if (((TTI && !UseChainDepthWithTI) || 2030 MaxDepth >= Config.ReqChainDepth) && 2031 EffSize > 0 && EffSize > BestEffSize) { 2032 BestMaxDepth = MaxDepth; 2033 BestEffSize = EffSize; 2034 BestTree = PrunedTree; 2035 } 2036 } 2037 } 2038 2039 // Given the list of candidate pairs, this function selects those 2040 // that will be fused into vector instructions. 2041 void BBVectorize::choosePairs( 2042 std::multimap<Value *, Value *> &CandidatePairs, 2043 DenseMap<ValuePair, int> &CandidatePairCostSavings, 2044 std::vector<Value *> &PairableInsts, 2045 DenseSet<ValuePair> &FixedOrderPairs, 2046 DenseMap<VPPair, unsigned> &PairConnectionTypes, 2047 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 2048 std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, 2049 DenseSet<ValuePair> &PairableInstUsers, 2050 DenseMap<Value *, Value *>& ChosenPairs) { 2051 bool UseCycleCheck = 2052 CandidatePairs.size() <= Config.MaxCandPairsForCycleCheck; 2053 std::multimap<ValuePair, ValuePair> PairableInstUserMap; 2054 for (std::vector<Value *>::iterator I = PairableInsts.begin(), 2055 E = PairableInsts.end(); I != E; ++I) { 2056 // The number of possible pairings for this variable: 2057 size_t NumChoices = CandidatePairs.count(*I); 2058 if (!NumChoices) continue; 2059 2060 VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I); 2061 2062 // The best pair to choose and its tree: 2063 size_t BestMaxDepth = 0; 2064 int BestEffSize = 0; 2065 DenseSet<ValuePair> BestTree; 2066 findBestTreeFor(CandidatePairs, CandidatePairCostSavings, 2067 PairableInsts, FixedOrderPairs, PairConnectionTypes, 2068 ConnectedPairs, ConnectedPairDeps, 2069 PairableInstUsers, PairableInstUserMap, ChosenPairs, 2070 BestTree, BestMaxDepth, BestEffSize, ChoiceRange, 2071 UseCycleCheck); 2072 2073 // A tree has been chosen (or not) at this point. If no tree was 2074 // chosen, then this instruction, I, cannot be paired (and is no longer 2075 // considered). 2076 2077 DEBUG(if (BestTree.size() > 0) 2078 dbgs() << "BBV: selected pairs in the best tree for: " 2079 << *cast<Instruction>(*I) << "\n"); 2080 2081 for (DenseSet<ValuePair>::iterator S = BestTree.begin(), 2082 SE2 = BestTree.end(); S != SE2; ++S) { 2083 // Insert the members of this tree into the list of chosen pairs. 2084 ChosenPairs.insert(ValuePair(S->first, S->second)); 2085 DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << 2086 *S->second << "\n"); 2087 2088 // Remove all candidate pairs that have values in the chosen tree. 2089 for (std::multimap<Value *, Value *>::iterator K = 2090 CandidatePairs.begin(); K != CandidatePairs.end();) { 2091 if (K->first == S->first || K->second == S->first || 2092 K->second == S->second || K->first == S->second) { 2093 // Don't remove the actual pair chosen so that it can be used 2094 // in subsequent tree selections. 2095 if (!(K->first == S->first && K->second == S->second)) 2096 CandidatePairs.erase(K++); 2097 else 2098 ++K; 2099 } else { 2100 ++K; 2101 } 2102 } 2103 } 2104 } 2105 2106 DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); 2107 } 2108 2109 std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, 2110 unsigned n = 0) { 2111 if (!I->hasName()) 2112 return ""; 2113 2114 return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + 2115 (n > 0 ? "." + utostr(n) : "")).str(); 2116 } 2117 2118 // Returns the value that is to be used as the pointer input to the vector 2119 // instruction that fuses I with J. 2120 Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, 2121 Instruction *I, Instruction *J, unsigned o) { 2122 Value *IPtr, *JPtr; 2123 unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; 2124 int64_t OffsetInElmts; 2125 2126 // Note: the analysis might fail here, that is why the pair order has 2127 // been precomputed (OffsetInElmts must be unused here). 2128 (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 2129 IAddressSpace, JAddressSpace, 2130 OffsetInElmts, false); 2131 2132 // The pointer value is taken to be the one with the lowest offset. 2133 Value *VPtr = IPtr; 2134 2135 Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType(); 2136 Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType(); 2137 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2138 Type *VArgPtrType = PointerType::get(VArgType, 2139 cast<PointerType>(IPtr->getType())->getAddressSpace()); 2140 return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), 2141 /* insert before */ I); 2142 } 2143 2144 void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, 2145 unsigned MaskOffset, unsigned NumInElem, 2146 unsigned NumInElem1, unsigned IdxOffset, 2147 std::vector<Constant*> &Mask) { 2148 unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements(); 2149 for (unsigned v = 0; v < NumElem1; ++v) { 2150 int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); 2151 if (m < 0) { 2152 Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); 2153 } else { 2154 unsigned mm = m + (int) IdxOffset; 2155 if (m >= (int) NumInElem1) 2156 mm += (int) NumInElem; 2157 2158 Mask[v+MaskOffset] = 2159 ConstantInt::get(Type::getInt32Ty(Context), mm); 2160 } 2161 } 2162 } 2163 2164 // Returns the value that is to be used as the vector-shuffle mask to the 2165 // vector instruction that fuses I with J. 2166 Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, 2167 Instruction *I, Instruction *J) { 2168 // This is the shuffle mask. We need to append the second 2169 // mask to the first, and the numbers need to be adjusted. 2170 2171 Type *ArgTypeI = I->getType(); 2172 Type *ArgTypeJ = J->getType(); 2173 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2174 2175 unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements(); 2176 2177 // Get the total number of elements in the fused vector type. 2178 // By definition, this must equal the number of elements in 2179 // the final mask. 2180 unsigned NumElem = cast<VectorType>(VArgType)->getNumElements(); 2181 std::vector<Constant*> Mask(NumElem); 2182 2183 Type *OpTypeI = I->getOperand(0)->getType(); 2184 unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements(); 2185 Type *OpTypeJ = J->getOperand(0)->getType(); 2186 unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements(); 2187 2188 // The fused vector will be: 2189 // ----------------------------------------------------- 2190 // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ | 2191 // ----------------------------------------------------- 2192 // from which we'll extract NumElem total elements (where the first NumElemI 2193 // of them come from the mask in I and the remainder come from the mask 2194 // in J. 2195 2196 // For the mask from the first pair... 2197 fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI, 2198 0, Mask); 2199 2200 // For the mask from the second pair... 2201 fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ, 2202 NumInElemI, Mask); 2203 2204 return ConstantVector::get(Mask); 2205 } 2206 2207 bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I, 2208 Instruction *J, unsigned o, Value *&LOp, 2209 unsigned numElemL, 2210 Type *ArgTypeL, Type *ArgTypeH, 2211 bool IBeforeJ, unsigned IdxOff) { 2212 bool ExpandedIEChain = false; 2213 if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) { 2214 // If we have a pure insertelement chain, then this can be rewritten 2215 // into a chain that directly builds the larger type. 2216 if (isPureIEChain(LIE)) { 2217 SmallVector<Value *, 8> VectElemts(numElemL, 2218 UndefValue::get(ArgTypeL->getScalarType())); 2219 InsertElementInst *LIENext = LIE; 2220 do { 2221 unsigned Idx = 2222 cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue(); 2223 VectElemts[Idx] = LIENext->getOperand(1); 2224 } while ((LIENext = 2225 dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); 2226 2227 LIENext = 0; 2228 Value *LIEPrev = UndefValue::get(ArgTypeH); 2229 for (unsigned i = 0; i < numElemL; ++i) { 2230 if (isa<UndefValue>(VectElemts[i])) continue; 2231 LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], 2232 ConstantInt::get(Type::getInt32Ty(Context), 2233 i + IdxOff), 2234 getReplacementName(IBeforeJ ? I : J, 2235 true, o, i+1)); 2236 LIENext->insertBefore(IBeforeJ ? J : I); 2237 LIEPrev = LIENext; 2238 } 2239 2240 LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH); 2241 ExpandedIEChain = true; 2242 } 2243 } 2244 2245 return ExpandedIEChain; 2246 } 2247 2248 // Returns the value to be used as the specified operand of the vector 2249 // instruction that fuses I with J. 2250 Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, 2251 Instruction *J, unsigned o, bool IBeforeJ) { 2252 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 2253 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); 2254 2255 // Compute the fused vector type for this operand 2256 Type *ArgTypeI = I->getOperand(o)->getType(); 2257 Type *ArgTypeJ = J->getOperand(o)->getType(); 2258 VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2259 2260 Instruction *L = I, *H = J; 2261 Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; 2262 2263 unsigned numElemL; 2264 if (ArgTypeL->isVectorTy()) 2265 numElemL = cast<VectorType>(ArgTypeL)->getNumElements(); 2266 else 2267 numElemL = 1; 2268 2269 unsigned numElemH; 2270 if (ArgTypeH->isVectorTy()) 2271 numElemH = cast<VectorType>(ArgTypeH)->getNumElements(); 2272 else 2273 numElemH = 1; 2274 2275 Value *LOp = L->getOperand(o); 2276 Value *HOp = H->getOperand(o); 2277 unsigned numElem = VArgType->getNumElements(); 2278 2279 // First, we check if we can reuse the "original" vector outputs (if these 2280 // exist). We might need a shuffle. 2281 ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp); 2282 ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp); 2283 ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp); 2284 ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp); 2285 2286 // FIXME: If we're fusing shuffle instructions, then we can't apply this 2287 // optimization. The input vectors to the shuffle might be a different 2288 // length from the shuffle outputs. Unfortunately, the replacement 2289 // shuffle mask has already been formed, and the mask entries are sensitive 2290 // to the sizes of the inputs. 2291 bool IsSizeChangeShuffle = 2292 isa<ShuffleVectorInst>(L) && 2293 (LOp->getType() != L->getType() || HOp->getType() != H->getType()); 2294 2295 if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) { 2296 // We can have at most two unique vector inputs. 2297 bool CanUseInputs = true; 2298 Value *I1, *I2 = 0; 2299 if (LEE) { 2300 I1 = LEE->getOperand(0); 2301 } else { 2302 I1 = LSV->getOperand(0); 2303 I2 = LSV->getOperand(1); 2304 if (I2 == I1 || isa<UndefValue>(I2)) 2305 I2 = 0; 2306 } 2307 2308 if (HEE) { 2309 Value *I3 = HEE->getOperand(0); 2310 if (!I2 && I3 != I1) 2311 I2 = I3; 2312 else if (I3 != I1 && I3 != I2) 2313 CanUseInputs = false; 2314 } else { 2315 Value *I3 = HSV->getOperand(0); 2316 if (!I2 && I3 != I1) 2317 I2 = I3; 2318 else if (I3 != I1 && I3 != I2) 2319 CanUseInputs = false; 2320 2321 if (CanUseInputs) { 2322 Value *I4 = HSV->getOperand(1); 2323 if (!isa<UndefValue>(I4)) { 2324 if (!I2 && I4 != I1) 2325 I2 = I4; 2326 else if (I4 != I1 && I4 != I2) 2327 CanUseInputs = false; 2328 } 2329 } 2330 } 2331 2332 if (CanUseInputs) { 2333 unsigned LOpElem = 2334 cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType()) 2335 ->getNumElements(); 2336 unsigned HOpElem = 2337 cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType()) 2338 ->getNumElements(); 2339 2340 // We have one or two input vectors. We need to map each index of the 2341 // operands to the index of the original vector. 2342 SmallVector<std::pair<int, int>, 8> II(numElem); 2343 for (unsigned i = 0; i < numElemL; ++i) { 2344 int Idx, INum; 2345 if (LEE) { 2346 Idx = 2347 cast<ConstantInt>(LEE->getOperand(1))->getSExtValue(); 2348 INum = LEE->getOperand(0) == I1 ? 0 : 1; 2349 } else { 2350 Idx = LSV->getMaskValue(i); 2351 if (Idx < (int) LOpElem) { 2352 INum = LSV->getOperand(0) == I1 ? 0 : 1; 2353 } else { 2354 Idx -= LOpElem; 2355 INum = LSV->getOperand(1) == I1 ? 0 : 1; 2356 } 2357 } 2358 2359 II[i] = std::pair<int, int>(Idx, INum); 2360 } 2361 for (unsigned i = 0; i < numElemH; ++i) { 2362 int Idx, INum; 2363 if (HEE) { 2364 Idx = 2365 cast<ConstantInt>(HEE->getOperand(1))->getSExtValue(); 2366 INum = HEE->getOperand(0) == I1 ? 0 : 1; 2367 } else { 2368 Idx = HSV->getMaskValue(i); 2369 if (Idx < (int) HOpElem) { 2370 INum = HSV->getOperand(0) == I1 ? 0 : 1; 2371 } else { 2372 Idx -= HOpElem; 2373 INum = HSV->getOperand(1) == I1 ? 0 : 1; 2374 } 2375 } 2376 2377 II[i + numElemL] = std::pair<int, int>(Idx, INum); 2378 } 2379 2380 // We now have an array which tells us from which index of which 2381 // input vector each element of the operand comes. 2382 VectorType *I1T = cast<VectorType>(I1->getType()); 2383 unsigned I1Elem = I1T->getNumElements(); 2384 2385 if (!I2) { 2386 // In this case there is only one underlying vector input. Check for 2387 // the trivial case where we can use the input directly. 2388 if (I1Elem == numElem) { 2389 bool ElemInOrder = true; 2390 for (unsigned i = 0; i < numElem; ++i) { 2391 if (II[i].first != (int) i && II[i].first != -1) { 2392 ElemInOrder = false; 2393 break; 2394 } 2395 } 2396 2397 if (ElemInOrder) 2398 return I1; 2399 } 2400 2401 // A shuffle is needed. 2402 std::vector<Constant *> Mask(numElem); 2403 for (unsigned i = 0; i < numElem; ++i) { 2404 int Idx = II[i].first; 2405 if (Idx == -1) 2406 Mask[i] = UndefValue::get(Type::getInt32Ty(Context)); 2407 else 2408 Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 2409 } 2410 2411 Instruction *S = 2412 new ShuffleVectorInst(I1, UndefValue::get(I1T), 2413 ConstantVector::get(Mask), 2414 getReplacementName(IBeforeJ ? I : J, 2415 true, o)); 2416 S->insertBefore(IBeforeJ ? J : I); 2417 return S; 2418 } 2419 2420 VectorType *I2T = cast<VectorType>(I2->getType()); 2421 unsigned I2Elem = I2T->getNumElements(); 2422 2423 // This input comes from two distinct vectors. The first step is to 2424 // make sure that both vectors are the same length. If not, the 2425 // smaller one will need to grow before they can be shuffled together. 2426 if (I1Elem < I2Elem) { 2427 std::vector<Constant *> Mask(I2Elem); 2428 unsigned v = 0; 2429 for (; v < I1Elem; ++v) 2430 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2431 for (; v < I2Elem; ++v) 2432 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2433 2434 Instruction *NewI1 = 2435 new ShuffleVectorInst(I1, UndefValue::get(I1T), 2436 ConstantVector::get(Mask), 2437 getReplacementName(IBeforeJ ? I : J, 2438 true, o, 1)); 2439 NewI1->insertBefore(IBeforeJ ? J : I); 2440 I1 = NewI1; 2441 I1T = I2T; 2442 I1Elem = I2Elem; 2443 } else if (I1Elem > I2Elem) { 2444 std::vector<Constant *> Mask(I1Elem); 2445 unsigned v = 0; 2446 for (; v < I2Elem; ++v) 2447 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2448 for (; v < I1Elem; ++v) 2449 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2450 2451 Instruction *NewI2 = 2452 new ShuffleVectorInst(I2, UndefValue::get(I2T), 2453 ConstantVector::get(Mask), 2454 getReplacementName(IBeforeJ ? I : J, 2455 true, o, 1)); 2456 NewI2->insertBefore(IBeforeJ ? J : I); 2457 I2 = NewI2; 2458 I2T = I1T; 2459 I2Elem = I1Elem; 2460 } 2461 2462 // Now that both I1 and I2 are the same length we can shuffle them 2463 // together (and use the result). 2464 std::vector<Constant *> Mask(numElem); 2465 for (unsigned v = 0; v < numElem; ++v) { 2466 if (II[v].first == -1) { 2467 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2468 } else { 2469 int Idx = II[v].first + II[v].second * I1Elem; 2470 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 2471 } 2472 } 2473 2474 Instruction *NewOp = 2475 new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), 2476 getReplacementName(IBeforeJ ? I : J, true, o)); 2477 NewOp->insertBefore(IBeforeJ ? J : I); 2478 return NewOp; 2479 } 2480 } 2481 2482 Type *ArgType = ArgTypeL; 2483 if (numElemL < numElemH) { 2484 if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, 2485 ArgTypeL, VArgType, IBeforeJ, 1)) { 2486 // This is another short-circuit case: we're combining a scalar into 2487 // a vector that is formed by an IE chain. We've just expanded the IE 2488 // chain, now insert the scalar and we're done. 2489 2490 Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, 2491 getReplacementName(IBeforeJ ? I : J, true, o)); 2492 S->insertBefore(IBeforeJ ? J : I); 2493 return S; 2494 } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, 2495 ArgTypeH, IBeforeJ)) { 2496 // The two vector inputs to the shuffle must be the same length, 2497 // so extend the smaller vector to be the same length as the larger one. 2498 Instruction *NLOp; 2499 if (numElemL > 1) { 2500 2501 std::vector<Constant *> Mask(numElemH); 2502 unsigned v = 0; 2503 for (; v < numElemL; ++v) 2504 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2505 for (; v < numElemH; ++v) 2506 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2507 2508 NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), 2509 ConstantVector::get(Mask), 2510 getReplacementName(IBeforeJ ? I : J, 2511 true, o, 1)); 2512 } else { 2513 NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, 2514 getReplacementName(IBeforeJ ? I : J, 2515 true, o, 1)); 2516 } 2517 2518 NLOp->insertBefore(IBeforeJ ? J : I); 2519 LOp = NLOp; 2520 } 2521 2522 ArgType = ArgTypeH; 2523 } else if (numElemL > numElemH) { 2524 if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, 2525 ArgTypeH, VArgType, IBeforeJ)) { 2526 Instruction *S = 2527 InsertElementInst::Create(LOp, HOp, 2528 ConstantInt::get(Type::getInt32Ty(Context), 2529 numElemL), 2530 getReplacementName(IBeforeJ ? I : J, 2531 true, o)); 2532 S->insertBefore(IBeforeJ ? J : I); 2533 return S; 2534 } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, 2535 ArgTypeL, IBeforeJ)) { 2536 Instruction *NHOp; 2537 if (numElemH > 1) { 2538 std::vector<Constant *> Mask(numElemL); 2539 unsigned v = 0; 2540 for (; v < numElemH; ++v) 2541 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2542 for (; v < numElemL; ++v) 2543 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2544 2545 NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), 2546 ConstantVector::get(Mask), 2547 getReplacementName(IBeforeJ ? I : J, 2548 true, o, 1)); 2549 } else { 2550 NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, 2551 getReplacementName(IBeforeJ ? I : J, 2552 true, o, 1)); 2553 } 2554 2555 NHOp->insertBefore(IBeforeJ ? J : I); 2556 HOp = NHOp; 2557 } 2558 } 2559 2560 if (ArgType->isVectorTy()) { 2561 unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); 2562 std::vector<Constant*> Mask(numElem); 2563 for (unsigned v = 0; v < numElem; ++v) { 2564 unsigned Idx = v; 2565 // If the low vector was expanded, we need to skip the extra 2566 // undefined entries. 2567 if (v >= numElemL && numElemH > numElemL) 2568 Idx += (numElemH - numElemL); 2569 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 2570 } 2571 2572 Instruction *BV = new ShuffleVectorInst(LOp, HOp, 2573 ConstantVector::get(Mask), 2574 getReplacementName(IBeforeJ ? I : J, true, o)); 2575 BV->insertBefore(IBeforeJ ? J : I); 2576 return BV; 2577 } 2578 2579 Instruction *BV1 = InsertElementInst::Create( 2580 UndefValue::get(VArgType), LOp, CV0, 2581 getReplacementName(IBeforeJ ? I : J, 2582 true, o, 1)); 2583 BV1->insertBefore(IBeforeJ ? J : I); 2584 Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, 2585 getReplacementName(IBeforeJ ? I : J, 2586 true, o, 2)); 2587 BV2->insertBefore(IBeforeJ ? J : I); 2588 return BV2; 2589 } 2590 2591 // This function creates an array of values that will be used as the inputs 2592 // to the vector instruction that fuses I with J. 2593 void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, 2594 Instruction *I, Instruction *J, 2595 SmallVector<Value *, 3> &ReplacedOperands, 2596 bool IBeforeJ) { 2597 unsigned NumOperands = I->getNumOperands(); 2598 2599 for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { 2600 // Iterate backward so that we look at the store pointer 2601 // first and know whether or not we need to flip the inputs. 2602 2603 if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { 2604 // This is the pointer for a load/store instruction. 2605 ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o); 2606 continue; 2607 } else if (isa<CallInst>(I)) { 2608 Function *F = cast<CallInst>(I)->getCalledFunction(); 2609 Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); 2610 if (o == NumOperands-1) { 2611 BasicBlock &BB = *I->getParent(); 2612 2613 Module *M = BB.getParent()->getParent(); 2614 Type *ArgTypeI = I->getType(); 2615 Type *ArgTypeJ = J->getType(); 2616 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2617 2618 ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType); 2619 continue; 2620 } else if (IID == Intrinsic::powi && o == 1) { 2621 // The second argument of powi is a single integer and we've already 2622 // checked that both arguments are equal. As a result, we just keep 2623 // I's second argument. 2624 ReplacedOperands[o] = I->getOperand(o); 2625 continue; 2626 } 2627 } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) { 2628 ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); 2629 continue; 2630 } 2631 2632 ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ); 2633 } 2634 } 2635 2636 // This function creates two values that represent the outputs of the 2637 // original I and J instructions. These are generally vector shuffles 2638 // or extracts. In many cases, these will end up being unused and, thus, 2639 // eliminated by later passes. 2640 void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 2641 Instruction *J, Instruction *K, 2642 Instruction *&InsertionPt, 2643 Instruction *&K1, Instruction *&K2) { 2644 if (isa<StoreInst>(I)) { 2645 AA->replaceWithNewValue(I, K); 2646 AA->replaceWithNewValue(J, K); 2647 } else { 2648 Type *IType = I->getType(); 2649 Type *JType = J->getType(); 2650 2651 VectorType *VType = getVecTypeForPair(IType, JType); 2652 unsigned numElem = VType->getNumElements(); 2653 2654 unsigned numElemI, numElemJ; 2655 if (IType->isVectorTy()) 2656 numElemI = cast<VectorType>(IType)->getNumElements(); 2657 else 2658 numElemI = 1; 2659 2660 if (JType->isVectorTy()) 2661 numElemJ = cast<VectorType>(JType)->getNumElements(); 2662 else 2663 numElemJ = 1; 2664 2665 if (IType->isVectorTy()) { 2666 std::vector<Constant*> Mask1(numElemI), Mask2(numElemI); 2667 for (unsigned v = 0; v < numElemI; ++v) { 2668 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2669 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v); 2670 } 2671 2672 K1 = new ShuffleVectorInst(K, UndefValue::get(VType), 2673 ConstantVector::get( Mask1), 2674 getReplacementName(K, false, 1)); 2675 } else { 2676 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 2677 K1 = ExtractElementInst::Create(K, CV0, 2678 getReplacementName(K, false, 1)); 2679 } 2680 2681 if (JType->isVectorTy()) { 2682 std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ); 2683 for (unsigned v = 0; v < numElemJ; ++v) { 2684 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2685 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v); 2686 } 2687 2688 K2 = new ShuffleVectorInst(K, UndefValue::get(VType), 2689 ConstantVector::get( Mask2), 2690 getReplacementName(K, false, 2)); 2691 } else { 2692 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); 2693 K2 = ExtractElementInst::Create(K, CV1, 2694 getReplacementName(K, false, 2)); 2695 } 2696 2697 K1->insertAfter(K); 2698 K2->insertAfter(K1); 2699 InsertionPt = K2; 2700 } 2701 } 2702 2703 // Move all uses of the function I (including pairing-induced uses) after J. 2704 bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, 2705 std::multimap<Value *, Value *> &LoadMoveSet, 2706 Instruction *I, Instruction *J) { 2707 // Skip to the first instruction past I. 2708 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 2709 2710 DenseSet<Value *> Users; 2711 AliasSetTracker WriteSet(*AA); 2712 for (; cast<Instruction>(L) != J; ++L) 2713 (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet); 2714 2715 assert(cast<Instruction>(L) == J && 2716 "Tracking has not proceeded far enough to check for dependencies"); 2717 // If J is now in the use set of I, then trackUsesOfI will return true 2718 // and we have a dependency cycle (and the fusing operation must abort). 2719 return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSet); 2720 } 2721 2722 // Move all uses of the function I (including pairing-induced uses) after J. 2723 void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, 2724 std::multimap<Value *, Value *> &LoadMoveSet, 2725 Instruction *&InsertionPt, 2726 Instruction *I, Instruction *J) { 2727 // Skip to the first instruction past I. 2728 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 2729 2730 DenseSet<Value *> Users; 2731 AliasSetTracker WriteSet(*AA); 2732 for (; cast<Instruction>(L) != J;) { 2733 if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) { 2734 // Move this instruction 2735 Instruction *InstToMove = L; ++L; 2736 2737 DEBUG(dbgs() << "BBV: moving: " << *InstToMove << 2738 " to after " << *InsertionPt << "\n"); 2739 InstToMove->removeFromParent(); 2740 InstToMove->insertAfter(InsertionPt); 2741 InsertionPt = InstToMove; 2742 } else { 2743 ++L; 2744 } 2745 } 2746 } 2747 2748 // Collect all load instruction that are in the move set of a given first 2749 // pair member. These loads depend on the first instruction, I, and so need 2750 // to be moved after J (the second instruction) when the pair is fused. 2751 void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, 2752 DenseMap<Value *, Value *> &ChosenPairs, 2753 std::multimap<Value *, Value *> &LoadMoveSet, 2754 Instruction *I) { 2755 // Skip to the first instruction past I. 2756 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 2757 2758 DenseSet<Value *> Users; 2759 AliasSetTracker WriteSet(*AA); 2760 2761 // Note: We cannot end the loop when we reach J because J could be moved 2762 // farther down the use chain by another instruction pairing. Also, J 2763 // could be before I if this is an inverted input. 2764 for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) { 2765 if (trackUsesOfI(Users, WriteSet, I, L)) { 2766 if (L->mayReadFromMemory()) 2767 LoadMoveSet.insert(ValuePair(L, I)); 2768 } 2769 } 2770 } 2771 2772 // In cases where both load/stores and the computation of their pointers 2773 // are chosen for vectorization, we can end up in a situation where the 2774 // aliasing analysis starts returning different query results as the 2775 // process of fusing instruction pairs continues. Because the algorithm 2776 // relies on finding the same use trees here as were found earlier, we'll 2777 // need to precompute the necessary aliasing information here and then 2778 // manually update it during the fusion process. 2779 void BBVectorize::collectLoadMoveSet(BasicBlock &BB, 2780 std::vector<Value *> &PairableInsts, 2781 DenseMap<Value *, Value *> &ChosenPairs, 2782 std::multimap<Value *, Value *> &LoadMoveSet) { 2783 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 2784 PIE = PairableInsts.end(); PI != PIE; ++PI) { 2785 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); 2786 if (P == ChosenPairs.end()) continue; 2787 2788 Instruction *I = cast<Instruction>(P->first); 2789 collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I); 2790 } 2791 } 2792 2793 // When the first instruction in each pair is cloned, it will inherit its 2794 // parent's metadata. This metadata must be combined with that of the other 2795 // instruction in a safe way. 2796 void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) { 2797 SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata; 2798 K->getAllMetadataOtherThanDebugLoc(Metadata); 2799 for (unsigned i = 0, n = Metadata.size(); i < n; ++i) { 2800 unsigned Kind = Metadata[i].first; 2801 MDNode *JMD = J->getMetadata(Kind); 2802 MDNode *KMD = Metadata[i].second; 2803 2804 switch (Kind) { 2805 default: 2806 K->setMetadata(Kind, 0); // Remove unknown metadata 2807 break; 2808 case LLVMContext::MD_tbaa: 2809 K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD)); 2810 break; 2811 case LLVMContext::MD_fpmath: 2812 K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD)); 2813 break; 2814 } 2815 } 2816 } 2817 2818 // This function fuses the chosen instruction pairs into vector instructions, 2819 // taking care preserve any needed scalar outputs and, then, it reorders the 2820 // remaining instructions as needed (users of the first member of the pair 2821 // need to be moved to after the location of the second member of the pair 2822 // because the vector instruction is inserted in the location of the pair's 2823 // second member). 2824 void BBVectorize::fuseChosenPairs(BasicBlock &BB, 2825 std::vector<Value *> &PairableInsts, 2826 DenseMap<Value *, Value *> &ChosenPairs, 2827 DenseSet<ValuePair> &FixedOrderPairs, 2828 DenseMap<VPPair, unsigned> &PairConnectionTypes, 2829 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 2830 std::multimap<ValuePair, ValuePair> &ConnectedPairDeps) { 2831 LLVMContext& Context = BB.getContext(); 2832 2833 // During the vectorization process, the order of the pairs to be fused 2834 // could be flipped. So we'll add each pair, flipped, into the ChosenPairs 2835 // list. After a pair is fused, the flipped pair is removed from the list. 2836 DenseSet<ValuePair> FlippedPairs; 2837 for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), 2838 E = ChosenPairs.end(); P != E; ++P) 2839 FlippedPairs.insert(ValuePair(P->second, P->first)); 2840 for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(), 2841 E = FlippedPairs.end(); P != E; ++P) 2842 ChosenPairs.insert(*P); 2843 2844 std::multimap<Value *, Value *> LoadMoveSet; 2845 collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet); 2846 2847 DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); 2848 2849 for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { 2850 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI); 2851 if (P == ChosenPairs.end()) { 2852 ++PI; 2853 continue; 2854 } 2855 2856 if (getDepthFactor(P->first) == 0) { 2857 // These instructions are not really fused, but are tracked as though 2858 // they are. Any case in which it would be interesting to fuse them 2859 // will be taken care of by InstCombine. 2860 --NumFusedOps; 2861 ++PI; 2862 continue; 2863 } 2864 2865 Instruction *I = cast<Instruction>(P->first), 2866 *J = cast<Instruction>(P->second); 2867 2868 DEBUG(dbgs() << "BBV: fusing: " << *I << 2869 " <-> " << *J << "\n"); 2870 2871 // Remove the pair and flipped pair from the list. 2872 DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second); 2873 assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); 2874 ChosenPairs.erase(FP); 2875 ChosenPairs.erase(P); 2876 2877 if (!canMoveUsesOfIAfterJ(BB, LoadMoveSet, I, J)) { 2878 DEBUG(dbgs() << "BBV: fusion of: " << *I << 2879 " <-> " << *J << 2880 " aborted because of non-trivial dependency cycle\n"); 2881 --NumFusedOps; 2882 ++PI; 2883 continue; 2884 } 2885 2886 // If the pair must have the other order, then flip it. 2887 bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I)); 2888 if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) { 2889 // This pair does not have a fixed order, and so we might want to 2890 // flip it if that will yield fewer shuffles. We count the number 2891 // of dependencies connected via swaps, and those directly connected, 2892 // and flip the order if the number of swaps is greater. 2893 bool OrigOrder = true; 2894 VPPIteratorPair IP = ConnectedPairDeps.equal_range(ValuePair(I, J)); 2895 if (IP.first == ConnectedPairDeps.end()) { 2896 IP = ConnectedPairDeps.equal_range(ValuePair(J, I)); 2897 OrigOrder = false; 2898 } 2899 2900 if (IP.first != ConnectedPairDeps.end()) { 2901 unsigned NumDepsDirect = 0, NumDepsSwap = 0; 2902 for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; 2903 Q != IP.second; ++Q) { 2904 DenseMap<VPPair, unsigned>::iterator R = 2905 PairConnectionTypes.find(VPPair(Q->second, Q->first)); 2906 assert(R != PairConnectionTypes.end() && 2907 "Cannot find pair connection type"); 2908 if (R->second == PairConnectionDirect) 2909 ++NumDepsDirect; 2910 else if (R->second == PairConnectionSwap) 2911 ++NumDepsSwap; 2912 } 2913 2914 if (!OrigOrder) 2915 std::swap(NumDepsDirect, NumDepsSwap); 2916 2917 if (NumDepsSwap > NumDepsDirect) { 2918 FlipPairOrder = true; 2919 DEBUG(dbgs() << "BBV: reordering pair: " << *I << 2920 " <-> " << *J << "\n"); 2921 } 2922 } 2923 } 2924 2925 Instruction *L = I, *H = J; 2926 if (FlipPairOrder) 2927 std::swap(H, L); 2928 2929 // If the pair being fused uses the opposite order from that in the pair 2930 // connection map, then we need to flip the types. 2931 VPPIteratorPair IP = ConnectedPairs.equal_range(ValuePair(H, L)); 2932 for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; 2933 Q != IP.second; ++Q) { 2934 DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(*Q); 2935 assert(R != PairConnectionTypes.end() && 2936 "Cannot find pair connection type"); 2937 if (R->second == PairConnectionDirect) 2938 R->second = PairConnectionSwap; 2939 else if (R->second == PairConnectionSwap) 2940 R->second = PairConnectionDirect; 2941 } 2942 2943 bool LBeforeH = !FlipPairOrder; 2944 unsigned NumOperands = I->getNumOperands(); 2945 SmallVector<Value *, 3> ReplacedOperands(NumOperands); 2946 getReplacementInputsForPair(Context, L, H, ReplacedOperands, 2947 LBeforeH); 2948 2949 // Make a copy of the original operation, change its type to the vector 2950 // type and replace its operands with the vector operands. 2951 Instruction *K = L->clone(); 2952 if (L->hasName()) 2953 K->takeName(L); 2954 else if (H->hasName()) 2955 K->takeName(H); 2956 2957 if (!isa<StoreInst>(K)) 2958 K->mutateType(getVecTypeForPair(L->getType(), H->getType())); 2959 2960 combineMetadata(K, H); 2961 K->intersectOptionalDataWith(H); 2962 2963 for (unsigned o = 0; o < NumOperands; ++o) 2964 K->setOperand(o, ReplacedOperands[o]); 2965 2966 K->insertAfter(J); 2967 2968 // Instruction insertion point: 2969 Instruction *InsertionPt = K; 2970 Instruction *K1 = 0, *K2 = 0; 2971 replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); 2972 2973 // The use tree of the first original instruction must be moved to after 2974 // the location of the second instruction. The entire use tree of the 2975 // first instruction is disjoint from the input tree of the second 2976 // (by definition), and so commutes with it. 2977 2978 moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J); 2979 2980 if (!isa<StoreInst>(I)) { 2981 L->replaceAllUsesWith(K1); 2982 H->replaceAllUsesWith(K2); 2983 AA->replaceWithNewValue(L, K1); 2984 AA->replaceWithNewValue(H, K2); 2985 } 2986 2987 // Instructions that may read from memory may be in the load move set. 2988 // Once an instruction is fused, we no longer need its move set, and so 2989 // the values of the map never need to be updated. However, when a load 2990 // is fused, we need to merge the entries from both instructions in the 2991 // pair in case those instructions were in the move set of some other 2992 // yet-to-be-fused pair. The loads in question are the keys of the map. 2993 if (I->mayReadFromMemory()) { 2994 std::vector<ValuePair> NewSetMembers; 2995 VPIteratorPair IPairRange = LoadMoveSet.equal_range(I); 2996 VPIteratorPair JPairRange = LoadMoveSet.equal_range(J); 2997 for (std::multimap<Value *, Value *>::iterator N = IPairRange.first; 2998 N != IPairRange.second; ++N) 2999 NewSetMembers.push_back(ValuePair(K, N->second)); 3000 for (std::multimap<Value *, Value *>::iterator N = JPairRange.first; 3001 N != JPairRange.second; ++N) 3002 NewSetMembers.push_back(ValuePair(K, N->second)); 3003 for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), 3004 AE = NewSetMembers.end(); A != AE; ++A) 3005 LoadMoveSet.insert(*A); 3006 } 3007 3008 // Before removing I, set the iterator to the next instruction. 3009 PI = llvm::next(BasicBlock::iterator(I)); 3010 if (cast<Instruction>(PI) == J) 3011 ++PI; 3012 3013 SE->forgetValue(I); 3014 SE->forgetValue(J); 3015 I->eraseFromParent(); 3016 J->eraseFromParent(); 3017 3018 DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" << 3019 BB << "\n"); 3020 } 3021 3022 DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); 3023 } 3024} 3025 3026char BBVectorize::ID = 0; 3027static const char bb_vectorize_name[] = "Basic-Block Vectorization"; 3028INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 3029INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 3030INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) 3031INITIALIZE_PASS_DEPENDENCY(DominatorTree) 3032INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 3033INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 3034 3035BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { 3036 return new BBVectorize(C); 3037} 3038 3039bool 3040llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { 3041 BBVectorize BBVectorizer(P, C); 3042 return BBVectorizer.vectorizeBB(BB); 3043} 3044 3045//===----------------------------------------------------------------------===// 3046VectorizeConfig::VectorizeConfig() { 3047 VectorBits = ::VectorBits; 3048 VectorizeBools = !::NoBools; 3049 VectorizeInts = !::NoInts; 3050 VectorizeFloats = !::NoFloats; 3051 VectorizePointers = !::NoPointers; 3052 VectorizeCasts = !::NoCasts; 3053 VectorizeMath = !::NoMath; 3054 VectorizeFMA = !::NoFMA; 3055 VectorizeSelect = !::NoSelect; 3056 VectorizeCmp = !::NoCmp; 3057 VectorizeGEP = !::NoGEP; 3058 VectorizeMemOps = !::NoMemOps; 3059 AlignedOnly = ::AlignedOnly; 3060 ReqChainDepth= ::ReqChainDepth; 3061 SearchLimit = ::SearchLimit; 3062 MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; 3063 SplatBreaksChain = ::SplatBreaksChain; 3064 MaxInsts = ::MaxInsts; 3065 MaxIter = ::MaxIter; 3066 Pow2LenOnly = ::Pow2LenOnly; 3067 NoMemOpBoost = ::NoMemOpBoost; 3068 FastDep = ::FastDep; 3069} 3070