BBVectorize.cpp revision 97a241b173a1413df5a93fdd891ddfac36dabad9
1//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements a basic-block vectorization pass. The algorithm was 11// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, 12// et al. It works by looking for chains of pairable operations and then 13// pairing them. 14// 15//===----------------------------------------------------------------------===// 16 17#define BBV_NAME "bb-vectorize" 18#define DEBUG_TYPE BBV_NAME 19#include "llvm/Transforms/Vectorize.h" 20#include "llvm/ADT/DenseMap.h" 21#include "llvm/ADT/DenseSet.h" 22#include "llvm/ADT/STLExtras.h" 23#include "llvm/ADT/SmallSet.h" 24#include "llvm/ADT/SmallVector.h" 25#include "llvm/ADT/Statistic.h" 26#include "llvm/ADT/StringExtras.h" 27#include "llvm/Analysis/AliasAnalysis.h" 28#include "llvm/Analysis/AliasSetTracker.h" 29#include "llvm/Analysis/Dominators.h" 30#include "llvm/Analysis/ScalarEvolution.h" 31#include "llvm/Analysis/ScalarEvolutionExpressions.h" 32#include "llvm/Analysis/TargetTransformInfo.h" 33#include "llvm/Analysis/ValueTracking.h" 34#include "llvm/IR/Constants.h" 35#include "llvm/IR/DataLayout.h" 36#include "llvm/IR/DerivedTypes.h" 37#include "llvm/IR/Function.h" 38#include "llvm/IR/Instructions.h" 39#include "llvm/IR/IntrinsicInst.h" 40#include "llvm/IR/Intrinsics.h" 41#include "llvm/IR/LLVMContext.h" 42#include "llvm/IR/Metadata.h" 43#include "llvm/IR/Type.h" 44#include "llvm/Pass.h" 45#include "llvm/Support/CommandLine.h" 46#include "llvm/Support/Debug.h" 47#include "llvm/Support/ValueHandle.h" 48#include "llvm/Support/raw_ostream.h" 49#include "llvm/Transforms/Utils/Local.h" 50#include <algorithm> 51using namespace llvm; 52 53static cl::opt<bool> 54IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), 55 cl::Hidden, cl::desc("Ignore target information")); 56 57static cl::opt<unsigned> 58ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, 59 cl::desc("The required chain depth for vectorization")); 60 61static cl::opt<bool> 62UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), 63 cl::Hidden, cl::desc("Use the chain depth requirement with" 64 " target information")); 65 66static cl::opt<unsigned> 67SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, 68 cl::desc("The maximum search distance for instruction pairs")); 69 70static cl::opt<bool> 71SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, 72 cl::desc("Replicating one element to a pair breaks the chain")); 73 74static cl::opt<unsigned> 75VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, 76 cl::desc("The size of the native vector registers")); 77 78static cl::opt<unsigned> 79MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, 80 cl::desc("The maximum number of pairing iterations")); 81 82static cl::opt<bool> 83Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, 84 cl::desc("Don't try to form non-2^n-length vectors")); 85 86static cl::opt<unsigned> 87MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, 88 cl::desc("The maximum number of pairable instructions per group")); 89 90static cl::opt<unsigned> 91MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), 92 cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" 93 " a full cycle check")); 94 95static cl::opt<bool> 96NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, 97 cl::desc("Don't try to vectorize boolean (i1) values")); 98 99static cl::opt<bool> 100NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, 101 cl::desc("Don't try to vectorize integer values")); 102 103static cl::opt<bool> 104NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, 105 cl::desc("Don't try to vectorize floating-point values")); 106 107// FIXME: This should default to false once pointer vector support works. 108static cl::opt<bool> 109NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden, 110 cl::desc("Don't try to vectorize pointer values")); 111 112static cl::opt<bool> 113NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, 114 cl::desc("Don't try to vectorize casting (conversion) operations")); 115 116static cl::opt<bool> 117NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, 118 cl::desc("Don't try to vectorize floating-point math intrinsics")); 119 120static cl::opt<bool> 121NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, 122 cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); 123 124static cl::opt<bool> 125NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, 126 cl::desc("Don't try to vectorize select instructions")); 127 128static cl::opt<bool> 129NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, 130 cl::desc("Don't try to vectorize comparison instructions")); 131 132static cl::opt<bool> 133NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, 134 cl::desc("Don't try to vectorize getelementptr instructions")); 135 136static cl::opt<bool> 137NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, 138 cl::desc("Don't try to vectorize loads and stores")); 139 140static cl::opt<bool> 141AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, 142 cl::desc("Only generate aligned loads and stores")); 143 144static cl::opt<bool> 145NoMemOpBoost("bb-vectorize-no-mem-op-boost", 146 cl::init(false), cl::Hidden, 147 cl::desc("Don't boost the chain-depth contribution of loads and stores")); 148 149static cl::opt<bool> 150FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, 151 cl::desc("Use a fast instruction dependency analysis")); 152 153#ifndef NDEBUG 154static cl::opt<bool> 155DebugInstructionExamination("bb-vectorize-debug-instruction-examination", 156 cl::init(false), cl::Hidden, 157 cl::desc("When debugging is enabled, output information on the" 158 " instruction-examination process")); 159static cl::opt<bool> 160DebugCandidateSelection("bb-vectorize-debug-candidate-selection", 161 cl::init(false), cl::Hidden, 162 cl::desc("When debugging is enabled, output information on the" 163 " candidate-selection process")); 164static cl::opt<bool> 165DebugPairSelection("bb-vectorize-debug-pair-selection", 166 cl::init(false), cl::Hidden, 167 cl::desc("When debugging is enabled, output information on the" 168 " pair-selection process")); 169static cl::opt<bool> 170DebugCycleCheck("bb-vectorize-debug-cycle-check", 171 cl::init(false), cl::Hidden, 172 cl::desc("When debugging is enabled, output information on the" 173 " cycle-checking process")); 174 175static cl::opt<bool> 176PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", 177 cl::init(false), cl::Hidden, 178 cl::desc("When debugging is enabled, dump the basic block after" 179 " every pair is fused")); 180#endif 181 182STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); 183 184namespace { 185 struct BBVectorize : public BasicBlockPass { 186 static char ID; // Pass identification, replacement for typeid 187 188 const VectorizeConfig Config; 189 190 BBVectorize(const VectorizeConfig &C = VectorizeConfig()) 191 : BasicBlockPass(ID), Config(C) { 192 initializeBBVectorizePass(*PassRegistry::getPassRegistry()); 193 } 194 195 BBVectorize(Pass *P, const VectorizeConfig &C) 196 : BasicBlockPass(ID), Config(C) { 197 AA = &P->getAnalysis<AliasAnalysis>(); 198 DT = &P->getAnalysis<DominatorTree>(); 199 SE = &P->getAnalysis<ScalarEvolution>(); 200 TD = P->getAnalysisIfAvailable<DataLayout>(); 201 TTI = IgnoreTargetInfo ? 0 : &P->getAnalysis<TargetTransformInfo>(); 202 } 203 204 typedef std::pair<Value *, Value *> ValuePair; 205 typedef std::pair<ValuePair, int> ValuePairWithCost; 206 typedef std::pair<ValuePair, size_t> ValuePairWithDepth; 207 typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair 208 typedef std::pair<VPPair, unsigned> VPPairWithType; 209 210 AliasAnalysis *AA; 211 DominatorTree *DT; 212 ScalarEvolution *SE; 213 DataLayout *TD; 214 const TargetTransformInfo *TTI; 215 216 // FIXME: const correct? 217 218 bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false); 219 220 bool getCandidatePairs(BasicBlock &BB, 221 BasicBlock::iterator &Start, 222 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 223 DenseSet<ValuePair> &FixedOrderPairs, 224 DenseMap<ValuePair, int> &CandidatePairCostSavings, 225 std::vector<Value *> &PairableInsts, bool NonPow2Len); 226 227 // FIXME: The current implementation does not account for pairs that 228 // are connected in multiple ways. For example: 229 // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) 230 enum PairConnectionType { 231 PairConnectionDirect, 232 PairConnectionSwap, 233 PairConnectionSplat 234 }; 235 236 void computeConnectedPairs( 237 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 238 DenseSet<ValuePair> &CandidatePairsSet, 239 std::vector<Value *> &PairableInsts, 240 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 241 DenseMap<VPPair, unsigned> &PairConnectionTypes); 242 243 void buildDepMap(BasicBlock &BB, 244 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 245 std::vector<Value *> &PairableInsts, 246 DenseSet<ValuePair> &PairableInstUsers); 247 248 void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 249 DenseSet<ValuePair> &CandidatePairsSet, 250 DenseMap<ValuePair, int> &CandidatePairCostSavings, 251 std::vector<Value *> &PairableInsts, 252 DenseSet<ValuePair> &FixedOrderPairs, 253 DenseMap<VPPair, unsigned> &PairConnectionTypes, 254 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 255 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 256 DenseSet<ValuePair> &PairableInstUsers, 257 DenseMap<Value *, Value *>& ChosenPairs); 258 259 void fuseChosenPairs(BasicBlock &BB, 260 std::vector<Value *> &PairableInsts, 261 DenseMap<Value *, Value *>& ChosenPairs, 262 DenseSet<ValuePair> &FixedOrderPairs, 263 DenseMap<VPPair, unsigned> &PairConnectionTypes, 264 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 265 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps); 266 267 268 bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); 269 270 bool areInstsCompatible(Instruction *I, Instruction *J, 271 bool IsSimpleLoadStore, bool NonPow2Len, 272 int &CostSavings, int &FixedOrder); 273 274 bool trackUsesOfI(DenseSet<Value *> &Users, 275 AliasSetTracker &WriteSet, Instruction *I, 276 Instruction *J, bool UpdateUsers = true, 277 DenseSet<ValuePair> *LoadMoveSetPairs = 0); 278 279 void computePairsConnectedTo( 280 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 281 DenseSet<ValuePair> &CandidatePairsSet, 282 std::vector<Value *> &PairableInsts, 283 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 284 DenseMap<VPPair, unsigned> &PairConnectionTypes, 285 ValuePair P); 286 287 bool pairsConflict(ValuePair P, ValuePair Q, 288 DenseSet<ValuePair> &PairableInstUsers, 289 DenseMap<ValuePair, std::vector<ValuePair> > 290 *PairableInstUserMap = 0, 291 DenseSet<VPPair> *PairableInstUserPairSet = 0); 292 293 bool pairWillFormCycle(ValuePair P, 294 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers, 295 DenseSet<ValuePair> &CurrentPairs); 296 297 void pruneTreeFor( 298 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 299 std::vector<Value *> &PairableInsts, 300 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 301 DenseSet<ValuePair> &PairableInstUsers, 302 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 303 DenseSet<VPPair> &PairableInstUserPairSet, 304 DenseMap<Value *, Value *> &ChosenPairs, 305 DenseMap<ValuePair, size_t> &Tree, 306 DenseSet<ValuePair> &PrunedTree, ValuePair J, 307 bool UseCycleCheck); 308 309 void buildInitialTreeFor( 310 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 311 DenseSet<ValuePair> &CandidatePairsSet, 312 std::vector<Value *> &PairableInsts, 313 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 314 DenseSet<ValuePair> &PairableInstUsers, 315 DenseMap<Value *, Value *> &ChosenPairs, 316 DenseMap<ValuePair, size_t> &Tree, ValuePair J); 317 318 void findBestTreeFor( 319 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 320 DenseSet<ValuePair> &CandidatePairsSet, 321 DenseMap<ValuePair, int> &CandidatePairCostSavings, 322 std::vector<Value *> &PairableInsts, 323 DenseSet<ValuePair> &FixedOrderPairs, 324 DenseMap<VPPair, unsigned> &PairConnectionTypes, 325 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 326 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 327 DenseSet<ValuePair> &PairableInstUsers, 328 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 329 DenseSet<VPPair> &PairableInstUserPairSet, 330 DenseMap<Value *, Value *> &ChosenPairs, 331 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, 332 int &BestEffSize, Value *II, std::vector<Value *>&JJ, 333 bool UseCycleCheck); 334 335 Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, 336 Instruction *J, unsigned o); 337 338 void fillNewShuffleMask(LLVMContext& Context, Instruction *J, 339 unsigned MaskOffset, unsigned NumInElem, 340 unsigned NumInElem1, unsigned IdxOffset, 341 std::vector<Constant*> &Mask); 342 343 Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, 344 Instruction *J); 345 346 bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, 347 unsigned o, Value *&LOp, unsigned numElemL, 348 Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ, 349 unsigned IdxOff = 0); 350 351 Value *getReplacementInput(LLVMContext& Context, Instruction *I, 352 Instruction *J, unsigned o, bool IBeforeJ); 353 354 void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, 355 Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, 356 bool IBeforeJ); 357 358 void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 359 Instruction *J, Instruction *K, 360 Instruction *&InsertionPt, Instruction *&K1, 361 Instruction *&K2); 362 363 void collectPairLoadMoveSet(BasicBlock &BB, 364 DenseMap<Value *, Value *> &ChosenPairs, 365 DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 366 DenseSet<ValuePair> &LoadMoveSetPairs, 367 Instruction *I); 368 369 void collectLoadMoveSet(BasicBlock &BB, 370 std::vector<Value *> &PairableInsts, 371 DenseMap<Value *, Value *> &ChosenPairs, 372 DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 373 DenseSet<ValuePair> &LoadMoveSetPairs); 374 375 bool canMoveUsesOfIAfterJ(BasicBlock &BB, 376 DenseSet<ValuePair> &LoadMoveSetPairs, 377 Instruction *I, Instruction *J); 378 379 void moveUsesOfIAfterJ(BasicBlock &BB, 380 DenseSet<ValuePair> &LoadMoveSetPairs, 381 Instruction *&InsertionPt, 382 Instruction *I, Instruction *J); 383 384 void combineMetadata(Instruction *K, const Instruction *J); 385 386 bool vectorizeBB(BasicBlock &BB) { 387 if (!DT->isReachableFromEntry(&BB)) { 388 DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() << 389 " in " << BB.getParent()->getName() << "\n"); 390 return false; 391 } 392 393 DEBUG(if (TTI) dbgs() << "BBV: using target information\n"); 394 395 bool changed = false; 396 // Iterate a sufficient number of times to merge types of size 1 bit, 397 // then 2 bits, then 4, etc. up to half of the target vector width of the 398 // target vector register. 399 unsigned n = 1; 400 for (unsigned v = 2; 401 (TTI || v <= Config.VectorBits) && 402 (!Config.MaxIter || n <= Config.MaxIter); 403 v *= 2, ++n) { 404 DEBUG(dbgs() << "BBV: fusing loop #" << n << 405 " for " << BB.getName() << " in " << 406 BB.getParent()->getName() << "...\n"); 407 if (vectorizePairs(BB)) 408 changed = true; 409 else 410 break; 411 } 412 413 if (changed && !Pow2LenOnly) { 414 ++n; 415 for (; !Config.MaxIter || n <= Config.MaxIter; ++n) { 416 DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << 417 n << " for " << BB.getName() << " in " << 418 BB.getParent()->getName() << "...\n"); 419 if (!vectorizePairs(BB, true)) break; 420 } 421 } 422 423 DEBUG(dbgs() << "BBV: done!\n"); 424 return changed; 425 } 426 427 virtual bool runOnBasicBlock(BasicBlock &BB) { 428 AA = &getAnalysis<AliasAnalysis>(); 429 DT = &getAnalysis<DominatorTree>(); 430 SE = &getAnalysis<ScalarEvolution>(); 431 TD = getAnalysisIfAvailable<DataLayout>(); 432 TTI = IgnoreTargetInfo ? 0 : &getAnalysis<TargetTransformInfo>(); 433 434 return vectorizeBB(BB); 435 } 436 437 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 438 BasicBlockPass::getAnalysisUsage(AU); 439 AU.addRequired<AliasAnalysis>(); 440 AU.addRequired<DominatorTree>(); 441 AU.addRequired<ScalarEvolution>(); 442 AU.addRequired<TargetTransformInfo>(); 443 AU.addPreserved<AliasAnalysis>(); 444 AU.addPreserved<DominatorTree>(); 445 AU.addPreserved<ScalarEvolution>(); 446 AU.setPreservesCFG(); 447 } 448 449 static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) { 450 assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() && 451 "Cannot form vector from incompatible scalar types"); 452 Type *STy = ElemTy->getScalarType(); 453 454 unsigned numElem; 455 if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { 456 numElem = VTy->getNumElements(); 457 } else { 458 numElem = 1; 459 } 460 461 if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) { 462 numElem += VTy->getNumElements(); 463 } else { 464 numElem += 1; 465 } 466 467 return VectorType::get(STy, numElem); 468 } 469 470 static inline void getInstructionTypes(Instruction *I, 471 Type *&T1, Type *&T2) { 472 if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 473 // For stores, it is the value type, not the pointer type that matters 474 // because the value is what will come from a vector register. 475 476 Value *IVal = SI->getValueOperand(); 477 T1 = IVal->getType(); 478 } else { 479 T1 = I->getType(); 480 } 481 482 if (CastInst *CI = dyn_cast<CastInst>(I)) 483 T2 = CI->getSrcTy(); 484 else 485 T2 = T1; 486 487 if (SelectInst *SI = dyn_cast<SelectInst>(I)) { 488 T2 = SI->getCondition()->getType(); 489 } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) { 490 T2 = SI->getOperand(0)->getType(); 491 } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) { 492 T2 = CI->getOperand(0)->getType(); 493 } 494 } 495 496 // Returns the weight associated with the provided value. A chain of 497 // candidate pairs has a length given by the sum of the weights of its 498 // members (one weight per pair; the weight of each member of the pair 499 // is assumed to be the same). This length is then compared to the 500 // chain-length threshold to determine if a given chain is significant 501 // enough to be vectorized. The length is also used in comparing 502 // candidate chains where longer chains are considered to be better. 503 // Note: when this function returns 0, the resulting instructions are 504 // not actually fused. 505 inline size_t getDepthFactor(Value *V) { 506 // InsertElement and ExtractElement have a depth factor of zero. This is 507 // for two reasons: First, they cannot be usefully fused. Second, because 508 // the pass generates a lot of these, they can confuse the simple metric 509 // used to compare the trees in the next iteration. Thus, giving them a 510 // weight of zero allows the pass to essentially ignore them in 511 // subsequent iterations when looking for vectorization opportunities 512 // while still tracking dependency chains that flow through those 513 // instructions. 514 if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V)) 515 return 0; 516 517 // Give a load or store half of the required depth so that load/store 518 // pairs will vectorize. 519 if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V))) 520 return Config.ReqChainDepth/2; 521 522 return 1; 523 } 524 525 // Returns the cost of the provided instruction using TTI. 526 // This does not handle loads and stores. 527 unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) { 528 switch (Opcode) { 529 default: break; 530 case Instruction::GetElementPtr: 531 // We mark this instruction as zero-cost because scalar GEPs are usually 532 // lowered to the intruction addressing mode. At the moment we don't 533 // generate vector GEPs. 534 return 0; 535 case Instruction::Br: 536 return TTI->getCFInstrCost(Opcode); 537 case Instruction::PHI: 538 return 0; 539 case Instruction::Add: 540 case Instruction::FAdd: 541 case Instruction::Sub: 542 case Instruction::FSub: 543 case Instruction::Mul: 544 case Instruction::FMul: 545 case Instruction::UDiv: 546 case Instruction::SDiv: 547 case Instruction::FDiv: 548 case Instruction::URem: 549 case Instruction::SRem: 550 case Instruction::FRem: 551 case Instruction::Shl: 552 case Instruction::LShr: 553 case Instruction::AShr: 554 case Instruction::And: 555 case Instruction::Or: 556 case Instruction::Xor: 557 return TTI->getArithmeticInstrCost(Opcode, T1); 558 case Instruction::Select: 559 case Instruction::ICmp: 560 case Instruction::FCmp: 561 return TTI->getCmpSelInstrCost(Opcode, T1, T2); 562 case Instruction::ZExt: 563 case Instruction::SExt: 564 case Instruction::FPToUI: 565 case Instruction::FPToSI: 566 case Instruction::FPExt: 567 case Instruction::PtrToInt: 568 case Instruction::IntToPtr: 569 case Instruction::SIToFP: 570 case Instruction::UIToFP: 571 case Instruction::Trunc: 572 case Instruction::FPTrunc: 573 case Instruction::BitCast: 574 case Instruction::ShuffleVector: 575 return TTI->getCastInstrCost(Opcode, T1, T2); 576 } 577 578 return 1; 579 } 580 581 // This determines the relative offset of two loads or stores, returning 582 // true if the offset could be determined to be some constant value. 583 // For example, if OffsetInElmts == 1, then J accesses the memory directly 584 // after I; if OffsetInElmts == -1 then I accesses the memory 585 // directly after J. 586 bool getPairPtrInfo(Instruction *I, Instruction *J, 587 Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, 588 unsigned &IAddressSpace, unsigned &JAddressSpace, 589 int64_t &OffsetInElmts, bool ComputeOffset = true) { 590 OffsetInElmts = 0; 591 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 592 LoadInst *LJ = cast<LoadInst>(J); 593 IPtr = LI->getPointerOperand(); 594 JPtr = LJ->getPointerOperand(); 595 IAlignment = LI->getAlignment(); 596 JAlignment = LJ->getAlignment(); 597 IAddressSpace = LI->getPointerAddressSpace(); 598 JAddressSpace = LJ->getPointerAddressSpace(); 599 } else { 600 StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J); 601 IPtr = SI->getPointerOperand(); 602 JPtr = SJ->getPointerOperand(); 603 IAlignment = SI->getAlignment(); 604 JAlignment = SJ->getAlignment(); 605 IAddressSpace = SI->getPointerAddressSpace(); 606 JAddressSpace = SJ->getPointerAddressSpace(); 607 } 608 609 if (!ComputeOffset) 610 return true; 611 612 const SCEV *IPtrSCEV = SE->getSCEV(IPtr); 613 const SCEV *JPtrSCEV = SE->getSCEV(JPtr); 614 615 // If this is a trivial offset, then we'll get something like 616 // 1*sizeof(type). With target data, which we need anyway, this will get 617 // constant folded into a number. 618 const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); 619 if (const SCEVConstant *ConstOffSCEV = 620 dyn_cast<SCEVConstant>(OffsetSCEV)) { 621 ConstantInt *IntOff = ConstOffSCEV->getValue(); 622 int64_t Offset = IntOff->getSExtValue(); 623 624 Type *VTy = cast<PointerType>(IPtr->getType())->getElementType(); 625 int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); 626 627 Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType(); 628 if (VTy != VTy2 && Offset < 0) { 629 int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2); 630 OffsetInElmts = Offset/VTy2TSS; 631 return (abs64(Offset) % VTy2TSS) == 0; 632 } 633 634 OffsetInElmts = Offset/VTyTSS; 635 return (abs64(Offset) % VTyTSS) == 0; 636 } 637 638 return false; 639 } 640 641 // Returns true if the provided CallInst represents an intrinsic that can 642 // be vectorized. 643 bool isVectorizableIntrinsic(CallInst* I) { 644 Function *F = I->getCalledFunction(); 645 if (!F) return false; 646 647 Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); 648 if (!IID) return false; 649 650 switch(IID) { 651 default: 652 return false; 653 case Intrinsic::sqrt: 654 case Intrinsic::powi: 655 case Intrinsic::sin: 656 case Intrinsic::cos: 657 case Intrinsic::log: 658 case Intrinsic::log2: 659 case Intrinsic::log10: 660 case Intrinsic::exp: 661 case Intrinsic::exp2: 662 case Intrinsic::pow: 663 return Config.VectorizeMath; 664 case Intrinsic::fma: 665 case Intrinsic::fmuladd: 666 return Config.VectorizeFMA; 667 } 668 } 669 670 bool isPureIEChain(InsertElementInst *IE) { 671 InsertElementInst *IENext = IE; 672 do { 673 if (!isa<UndefValue>(IENext->getOperand(0)) && 674 !isa<InsertElementInst>(IENext->getOperand(0))) { 675 return false; 676 } 677 } while ((IENext = 678 dyn_cast<InsertElementInst>(IENext->getOperand(0)))); 679 680 return true; 681 } 682 }; 683 684 // This function implements one vectorization iteration on the provided 685 // basic block. It returns true if the block is changed. 686 bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) { 687 bool ShouldContinue; 688 BasicBlock::iterator Start = BB.getFirstInsertionPt(); 689 690 std::vector<Value *> AllPairableInsts; 691 DenseMap<Value *, Value *> AllChosenPairs; 692 DenseSet<ValuePair> AllFixedOrderPairs; 693 DenseMap<VPPair, unsigned> AllPairConnectionTypes; 694 DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs, 695 AllConnectedPairDeps; 696 697 do { 698 std::vector<Value *> PairableInsts; 699 DenseMap<Value *, std::vector<Value *> > CandidatePairs; 700 DenseSet<ValuePair> FixedOrderPairs; 701 DenseMap<ValuePair, int> CandidatePairCostSavings; 702 ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, 703 FixedOrderPairs, 704 CandidatePairCostSavings, 705 PairableInsts, NonPow2Len); 706 if (PairableInsts.empty()) continue; 707 708 // Build the candidate pair set for faster lookups. 709 DenseSet<ValuePair> CandidatePairsSet; 710 for (DenseMap<Value *, std::vector<Value *> >::iterator I = 711 CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I) 712 for (std::vector<Value *>::iterator J = I->second.begin(), 713 JE = I->second.end(); J != JE; ++J) 714 CandidatePairsSet.insert(ValuePair(I->first, *J)); 715 716 // Now we have a map of all of the pairable instructions and we need to 717 // select the best possible pairing. A good pairing is one such that the 718 // users of the pair are also paired. This defines a (directed) forest 719 // over the pairs such that two pairs are connected iff the second pair 720 // uses the first. 721 722 // Note that it only matters that both members of the second pair use some 723 // element of the first pair (to allow for splatting). 724 725 DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs, 726 ConnectedPairDeps; 727 DenseMap<VPPair, unsigned> PairConnectionTypes; 728 computeConnectedPairs(CandidatePairs, CandidatePairsSet, 729 PairableInsts, ConnectedPairs, PairConnectionTypes); 730 if (ConnectedPairs.empty()) continue; 731 732 for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator 733 I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); 734 I != IE; ++I) 735 for (std::vector<ValuePair>::iterator J = I->second.begin(), 736 JE = I->second.end(); J != JE; ++J) 737 ConnectedPairDeps[*J].push_back(I->first); 738 739 // Build the pairable-instruction dependency map 740 DenseSet<ValuePair> PairableInstUsers; 741 buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); 742 743 // There is now a graph of the connected pairs. For each variable, pick 744 // the pairing with the largest tree meeting the depth requirement on at 745 // least one branch. Then select all pairings that are part of that tree 746 // and remove them from the list of available pairings and pairable 747 // variables. 748 749 DenseMap<Value *, Value *> ChosenPairs; 750 choosePairs(CandidatePairs, CandidatePairsSet, 751 CandidatePairCostSavings, 752 PairableInsts, FixedOrderPairs, PairConnectionTypes, 753 ConnectedPairs, ConnectedPairDeps, 754 PairableInstUsers, ChosenPairs); 755 756 if (ChosenPairs.empty()) continue; 757 AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), 758 PairableInsts.end()); 759 AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); 760 761 // Only for the chosen pairs, propagate information on fixed-order pairs, 762 // pair connections, and their types to the data structures used by the 763 // pair fusion procedures. 764 for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(), 765 IE = ChosenPairs.end(); I != IE; ++I) { 766 if (FixedOrderPairs.count(*I)) 767 AllFixedOrderPairs.insert(*I); 768 else if (FixedOrderPairs.count(ValuePair(I->second, I->first))) 769 AllFixedOrderPairs.insert(ValuePair(I->second, I->first)); 770 771 for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin(); 772 J != IE; ++J) { 773 DenseMap<VPPair, unsigned>::iterator K = 774 PairConnectionTypes.find(VPPair(*I, *J)); 775 if (K != PairConnectionTypes.end()) { 776 AllPairConnectionTypes.insert(*K); 777 } else { 778 K = PairConnectionTypes.find(VPPair(*J, *I)); 779 if (K != PairConnectionTypes.end()) 780 AllPairConnectionTypes.insert(*K); 781 } 782 } 783 } 784 785 for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator 786 I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); 787 I != IE; ++I) 788 for (std::vector<ValuePair>::iterator J = I->second.begin(), 789 JE = I->second.end(); J != JE; ++J) 790 if (AllPairConnectionTypes.count(VPPair(I->first, *J))) { 791 AllConnectedPairs[I->first].push_back(*J); 792 AllConnectedPairDeps[*J].push_back(I->first); 793 } 794 } while (ShouldContinue); 795 796 if (AllChosenPairs.empty()) return false; 797 NumFusedOps += AllChosenPairs.size(); 798 799 // A set of pairs has now been selected. It is now necessary to replace the 800 // paired instructions with vector instructions. For this procedure each 801 // operand must be replaced with a vector operand. This vector is formed 802 // by using build_vector on the old operands. The replaced values are then 803 // replaced with a vector_extract on the result. Subsequent optimization 804 // passes should coalesce the build/extract combinations. 805 806 fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs, 807 AllPairConnectionTypes, 808 AllConnectedPairs, AllConnectedPairDeps); 809 810 // It is important to cleanup here so that future iterations of this 811 // function have less work to do. 812 (void) SimplifyInstructionsInBlock(&BB, TD, AA->getTargetLibraryInfo()); 813 return true; 814 } 815 816 // This function returns true if the provided instruction is capable of being 817 // fused into a vector instruction. This determination is based only on the 818 // type and other attributes of the instruction. 819 bool BBVectorize::isInstVectorizable(Instruction *I, 820 bool &IsSimpleLoadStore) { 821 IsSimpleLoadStore = false; 822 823 if (CallInst *C = dyn_cast<CallInst>(I)) { 824 if (!isVectorizableIntrinsic(C)) 825 return false; 826 } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { 827 // Vectorize simple loads if possbile: 828 IsSimpleLoadStore = L->isSimple(); 829 if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 830 return false; 831 } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { 832 // Vectorize simple stores if possbile: 833 IsSimpleLoadStore = S->isSimple(); 834 if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 835 return false; 836 } else if (CastInst *C = dyn_cast<CastInst>(I)) { 837 // We can vectorize casts, but not casts of pointer types, etc. 838 if (!Config.VectorizeCasts) 839 return false; 840 841 Type *SrcTy = C->getSrcTy(); 842 if (!SrcTy->isSingleValueType()) 843 return false; 844 845 Type *DestTy = C->getDestTy(); 846 if (!DestTy->isSingleValueType()) 847 return false; 848 } else if (isa<SelectInst>(I)) { 849 if (!Config.VectorizeSelect) 850 return false; 851 } else if (isa<CmpInst>(I)) { 852 if (!Config.VectorizeCmp) 853 return false; 854 } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { 855 if (!Config.VectorizeGEP) 856 return false; 857 858 // Currently, vector GEPs exist only with one index. 859 if (G->getNumIndices() != 1) 860 return false; 861 } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || 862 isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { 863 return false; 864 } 865 866 // We can't vectorize memory operations without target data 867 if (TD == 0 && IsSimpleLoadStore) 868 return false; 869 870 Type *T1, *T2; 871 getInstructionTypes(I, T1, T2); 872 873 // Not every type can be vectorized... 874 if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || 875 !(VectorType::isValidElementType(T2) || T2->isVectorTy())) 876 return false; 877 878 if (T1->getScalarSizeInBits() == 1) { 879 if (!Config.VectorizeBools) 880 return false; 881 } else { 882 if (!Config.VectorizeInts && T1->isIntOrIntVectorTy()) 883 return false; 884 } 885 886 if (T2->getScalarSizeInBits() == 1) { 887 if (!Config.VectorizeBools) 888 return false; 889 } else { 890 if (!Config.VectorizeInts && T2->isIntOrIntVectorTy()) 891 return false; 892 } 893 894 if (!Config.VectorizeFloats 895 && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) 896 return false; 897 898 // Don't vectorize target-specific types. 899 if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy()) 900 return false; 901 if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) 902 return false; 903 904 if ((!Config.VectorizePointers || TD == 0) && 905 (T1->getScalarType()->isPointerTy() || 906 T2->getScalarType()->isPointerTy())) 907 return false; 908 909 if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || 910 T2->getPrimitiveSizeInBits() >= Config.VectorBits)) 911 return false; 912 913 return true; 914 } 915 916 // This function returns true if the two provided instructions are compatible 917 // (meaning that they can be fused into a vector instruction). This assumes 918 // that I has already been determined to be vectorizable and that J is not 919 // in the use tree of I. 920 bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, 921 bool IsSimpleLoadStore, bool NonPow2Len, 922 int &CostSavings, int &FixedOrder) { 923 DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << 924 " <-> " << *J << "\n"); 925 926 CostSavings = 0; 927 FixedOrder = 0; 928 929 // Loads and stores can be merged if they have different alignments, 930 // but are otherwise the same. 931 if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | 932 (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0))) 933 return false; 934 935 Type *IT1, *IT2, *JT1, *JT2; 936 getInstructionTypes(I, IT1, IT2); 937 getInstructionTypes(J, JT1, JT2); 938 unsigned MaxTypeBits = std::max( 939 IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), 940 IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); 941 if (!TTI && MaxTypeBits > Config.VectorBits) 942 return false; 943 944 // FIXME: handle addsub-type operations! 945 946 if (IsSimpleLoadStore) { 947 Value *IPtr, *JPtr; 948 unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; 949 int64_t OffsetInElmts = 0; 950 if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 951 IAddressSpace, JAddressSpace, 952 OffsetInElmts) && abs64(OffsetInElmts) == 1) { 953 FixedOrder = (int) OffsetInElmts; 954 unsigned BottomAlignment = IAlignment; 955 if (OffsetInElmts < 0) BottomAlignment = JAlignment; 956 957 Type *aTypeI = isa<StoreInst>(I) ? 958 cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); 959 Type *aTypeJ = isa<StoreInst>(J) ? 960 cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); 961 Type *VType = getVecTypeForPair(aTypeI, aTypeJ); 962 963 if (Config.AlignedOnly) { 964 // An aligned load or store is possible only if the instruction 965 // with the lower offset has an alignment suitable for the 966 // vector type. 967 968 unsigned VecAlignment = TD->getPrefTypeAlignment(VType); 969 if (BottomAlignment < VecAlignment) 970 return false; 971 } 972 973 if (TTI) { 974 unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI, 975 IAlignment, IAddressSpace); 976 unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ, 977 JAlignment, JAddressSpace); 978 unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType, 979 BottomAlignment, 980 IAddressSpace); 981 982 ICost += TTI->getAddressComputationCost(aTypeI); 983 JCost += TTI->getAddressComputationCost(aTypeJ); 984 VCost += TTI->getAddressComputationCost(VType); 985 986 if (VCost > ICost + JCost) 987 return false; 988 989 // We don't want to fuse to a type that will be split, even 990 // if the two input types will also be split and there is no other 991 // associated cost. 992 unsigned VParts = TTI->getNumberOfParts(VType); 993 if (VParts > 1) 994 return false; 995 else if (!VParts && VCost == ICost + JCost) 996 return false; 997 998 CostSavings = ICost + JCost - VCost; 999 } 1000 } else { 1001 return false; 1002 } 1003 } else if (TTI) { 1004 unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2); 1005 unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); 1006 Type *VT1 = getVecTypeForPair(IT1, JT1), 1007 *VT2 = getVecTypeForPair(IT2, JT2); 1008 1009 // Note that this procedure is incorrect for insert and extract element 1010 // instructions (because combining these often results in a shuffle), 1011 // but this cost is ignored (because insert and extract element 1012 // instructions are assigned a zero depth factor and are not really 1013 // fused in general). 1014 unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2); 1015 1016 if (VCost > ICost + JCost) 1017 return false; 1018 1019 // We don't want to fuse to a type that will be split, even 1020 // if the two input types will also be split and there is no other 1021 // associated cost. 1022 unsigned VParts1 = TTI->getNumberOfParts(VT1), 1023 VParts2 = TTI->getNumberOfParts(VT2); 1024 if (VParts1 > 1 || VParts2 > 1) 1025 return false; 1026 else if ((!VParts1 || !VParts2) && VCost == ICost + JCost) 1027 return false; 1028 1029 CostSavings = ICost + JCost - VCost; 1030 } 1031 1032 // The powi intrinsic is special because only the first argument is 1033 // vectorized, the second arguments must be equal. 1034 CallInst *CI = dyn_cast<CallInst>(I); 1035 Function *FI; 1036 if (CI && (FI = CI->getCalledFunction())) { 1037 Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID(); 1038 if (IID == Intrinsic::powi) { 1039 Value *A1I = CI->getArgOperand(1), 1040 *A1J = cast<CallInst>(J)->getArgOperand(1); 1041 const SCEV *A1ISCEV = SE->getSCEV(A1I), 1042 *A1JSCEV = SE->getSCEV(A1J); 1043 return (A1ISCEV == A1JSCEV); 1044 } 1045 1046 if (IID && TTI) { 1047 SmallVector<Type*, 4> Tys; 1048 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) 1049 Tys.push_back(CI->getArgOperand(i)->getType()); 1050 unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys); 1051 1052 Tys.clear(); 1053 CallInst *CJ = cast<CallInst>(J); 1054 for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i) 1055 Tys.push_back(CJ->getArgOperand(i)->getType()); 1056 unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys); 1057 1058 Tys.clear(); 1059 assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && 1060 "Intrinsic argument counts differ"); 1061 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 1062 if (IID == Intrinsic::powi && i == 1) 1063 Tys.push_back(CI->getArgOperand(i)->getType()); 1064 else 1065 Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), 1066 CJ->getArgOperand(i)->getType())); 1067 } 1068 1069 Type *RetTy = getVecTypeForPair(IT1, JT1); 1070 unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys); 1071 1072 if (VCost > ICost + JCost) 1073 return false; 1074 1075 // We don't want to fuse to a type that will be split, even 1076 // if the two input types will also be split and there is no other 1077 // associated cost. 1078 unsigned RetParts = TTI->getNumberOfParts(RetTy); 1079 if (RetParts > 1) 1080 return false; 1081 else if (!RetParts && VCost == ICost + JCost) 1082 return false; 1083 1084 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 1085 if (!Tys[i]->isVectorTy()) 1086 continue; 1087 1088 unsigned NumParts = TTI->getNumberOfParts(Tys[i]); 1089 if (NumParts > 1) 1090 return false; 1091 else if (!NumParts && VCost == ICost + JCost) 1092 return false; 1093 } 1094 1095 CostSavings = ICost + JCost - VCost; 1096 } 1097 } 1098 1099 return true; 1100 } 1101 1102 // Figure out whether or not J uses I and update the users and write-set 1103 // structures associated with I. Specifically, Users represents the set of 1104 // instructions that depend on I. WriteSet represents the set 1105 // of memory locations that are dependent on I. If UpdateUsers is true, 1106 // and J uses I, then Users is updated to contain J and WriteSet is updated 1107 // to contain any memory locations to which J writes. The function returns 1108 // true if J uses I. By default, alias analysis is used to determine 1109 // whether J reads from memory that overlaps with a location in WriteSet. 1110 // If LoadMoveSet is not null, then it is a previously-computed map 1111 // where the key is the memory-based user instruction and the value is 1112 // the instruction to be compared with I. So, if LoadMoveSet is provided, 1113 // then the alias analysis is not used. This is necessary because this 1114 // function is called during the process of moving instructions during 1115 // vectorization and the results of the alias analysis are not stable during 1116 // that process. 1117 bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, 1118 AliasSetTracker &WriteSet, Instruction *I, 1119 Instruction *J, bool UpdateUsers, 1120 DenseSet<ValuePair> *LoadMoveSetPairs) { 1121 bool UsesI = false; 1122 1123 // This instruction may already be marked as a user due, for example, to 1124 // being a member of a selected pair. 1125 if (Users.count(J)) 1126 UsesI = true; 1127 1128 if (!UsesI) 1129 for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); 1130 JU != JE; ++JU) { 1131 Value *V = *JU; 1132 if (I == V || Users.count(V)) { 1133 UsesI = true; 1134 break; 1135 } 1136 } 1137 if (!UsesI && J->mayReadFromMemory()) { 1138 if (LoadMoveSetPairs) { 1139 UsesI = LoadMoveSetPairs->count(ValuePair(J, I)); 1140 } else { 1141 for (AliasSetTracker::iterator W = WriteSet.begin(), 1142 WE = WriteSet.end(); W != WE; ++W) { 1143 if (W->aliasesUnknownInst(J, *AA)) { 1144 UsesI = true; 1145 break; 1146 } 1147 } 1148 } 1149 } 1150 1151 if (UsesI && UpdateUsers) { 1152 if (J->mayWriteToMemory()) WriteSet.add(J); 1153 Users.insert(J); 1154 } 1155 1156 return UsesI; 1157 } 1158 1159 // This function iterates over all instruction pairs in the provided 1160 // basic block and collects all candidate pairs for vectorization. 1161 bool BBVectorize::getCandidatePairs(BasicBlock &BB, 1162 BasicBlock::iterator &Start, 1163 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1164 DenseSet<ValuePair> &FixedOrderPairs, 1165 DenseMap<ValuePair, int> &CandidatePairCostSavings, 1166 std::vector<Value *> &PairableInsts, bool NonPow2Len) { 1167 BasicBlock::iterator E = BB.end(); 1168 if (Start == E) return false; 1169 1170 bool ShouldContinue = false, IAfterStart = false; 1171 for (BasicBlock::iterator I = Start++; I != E; ++I) { 1172 if (I == Start) IAfterStart = true; 1173 1174 bool IsSimpleLoadStore; 1175 if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; 1176 1177 // Look for an instruction with which to pair instruction *I... 1178 DenseSet<Value *> Users; 1179 AliasSetTracker WriteSet(*AA); 1180 bool JAfterStart = IAfterStart; 1181 BasicBlock::iterator J = llvm::next(I); 1182 for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { 1183 if (J == Start) JAfterStart = true; 1184 1185 // Determine if J uses I, if so, exit the loop. 1186 bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep); 1187 if (Config.FastDep) { 1188 // Note: For this heuristic to be effective, independent operations 1189 // must tend to be intermixed. This is likely to be true from some 1190 // kinds of grouped loop unrolling (but not the generic LLVM pass), 1191 // but otherwise may require some kind of reordering pass. 1192 1193 // When using fast dependency analysis, 1194 // stop searching after first use: 1195 if (UsesI) break; 1196 } else { 1197 if (UsesI) continue; 1198 } 1199 1200 // J does not use I, and comes before the first use of I, so it can be 1201 // merged with I if the instructions are compatible. 1202 int CostSavings, FixedOrder; 1203 if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len, 1204 CostSavings, FixedOrder)) continue; 1205 1206 // J is a candidate for merging with I. 1207 if (!PairableInsts.size() || 1208 PairableInsts[PairableInsts.size()-1] != I) { 1209 PairableInsts.push_back(I); 1210 } 1211 1212 CandidatePairs[I].push_back(J); 1213 if (TTI) 1214 CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), 1215 CostSavings)); 1216 1217 if (FixedOrder == 1) 1218 FixedOrderPairs.insert(ValuePair(I, J)); 1219 else if (FixedOrder == -1) 1220 FixedOrderPairs.insert(ValuePair(J, I)); 1221 1222 // The next call to this function must start after the last instruction 1223 // selected during this invocation. 1224 if (JAfterStart) { 1225 Start = llvm::next(J); 1226 IAfterStart = JAfterStart = false; 1227 } 1228 1229 DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " 1230 << *I << " <-> " << *J << " (cost savings: " << 1231 CostSavings << ")\n"); 1232 1233 // If we have already found too many pairs, break here and this function 1234 // will be called again starting after the last instruction selected 1235 // during this invocation. 1236 if (PairableInsts.size() >= Config.MaxInsts) { 1237 ShouldContinue = true; 1238 break; 1239 } 1240 } 1241 1242 if (ShouldContinue) 1243 break; 1244 } 1245 1246 DEBUG(dbgs() << "BBV: found " << PairableInsts.size() 1247 << " instructions with candidate pairs\n"); 1248 1249 return ShouldContinue; 1250 } 1251 1252 // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that 1253 // it looks for pairs such that both members have an input which is an 1254 // output of PI or PJ. 1255 void BBVectorize::computePairsConnectedTo( 1256 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1257 DenseSet<ValuePair> &CandidatePairsSet, 1258 std::vector<Value *> &PairableInsts, 1259 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1260 DenseMap<VPPair, unsigned> &PairConnectionTypes, 1261 ValuePair P) { 1262 StoreInst *SI, *SJ; 1263 1264 // For each possible pairing for this variable, look at the uses of 1265 // the first value... 1266 for (Value::use_iterator I = P.first->use_begin(), 1267 E = P.first->use_end(); I != E; ++I) { 1268 if (isa<LoadInst>(*I)) { 1269 // A pair cannot be connected to a load because the load only takes one 1270 // operand (the address) and it is a scalar even after vectorization. 1271 continue; 1272 } else if ((SI = dyn_cast<StoreInst>(*I)) && 1273 P.first == SI->getPointerOperand()) { 1274 // Similarly, a pair cannot be connected to a store through its 1275 // pointer operand. 1276 continue; 1277 } 1278 1279 // For each use of the first variable, look for uses of the second 1280 // variable... 1281 for (Value::use_iterator J = P.second->use_begin(), 1282 E2 = P.second->use_end(); J != E2; ++J) { 1283 if ((SJ = dyn_cast<StoreInst>(*J)) && 1284 P.second == SJ->getPointerOperand()) 1285 continue; 1286 1287 // Look for <I, J>: 1288 if (CandidatePairsSet.count(ValuePair(*I, *J))) { 1289 VPPair VP(P, ValuePair(*I, *J)); 1290 ConnectedPairs[VP.first].push_back(VP.second); 1291 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); 1292 } 1293 1294 // Look for <J, I>: 1295 if (CandidatePairsSet.count(ValuePair(*J, *I))) { 1296 VPPair VP(P, ValuePair(*J, *I)); 1297 ConnectedPairs[VP.first].push_back(VP.second); 1298 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); 1299 } 1300 } 1301 1302 if (Config.SplatBreaksChain) continue; 1303 // Look for cases where just the first value in the pair is used by 1304 // both members of another pair (splatting). 1305 for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) { 1306 if ((SJ = dyn_cast<StoreInst>(*J)) && 1307 P.first == SJ->getPointerOperand()) 1308 continue; 1309 1310 if (CandidatePairsSet.count(ValuePair(*I, *J))) { 1311 VPPair VP(P, ValuePair(*I, *J)); 1312 ConnectedPairs[VP.first].push_back(VP.second); 1313 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); 1314 } 1315 } 1316 } 1317 1318 if (Config.SplatBreaksChain) return; 1319 // Look for cases where just the second value in the pair is used by 1320 // both members of another pair (splatting). 1321 for (Value::use_iterator I = P.second->use_begin(), 1322 E = P.second->use_end(); I != E; ++I) { 1323 if (isa<LoadInst>(*I)) 1324 continue; 1325 else if ((SI = dyn_cast<StoreInst>(*I)) && 1326 P.second == SI->getPointerOperand()) 1327 continue; 1328 1329 for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { 1330 if ((SJ = dyn_cast<StoreInst>(*J)) && 1331 P.second == SJ->getPointerOperand()) 1332 continue; 1333 1334 if (CandidatePairsSet.count(ValuePair(*I, *J))) { 1335 VPPair VP(P, ValuePair(*I, *J)); 1336 ConnectedPairs[VP.first].push_back(VP.second); 1337 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); 1338 } 1339 } 1340 } 1341 } 1342 1343 // This function figures out which pairs are connected. Two pairs are 1344 // connected if some output of the first pair forms an input to both members 1345 // of the second pair. 1346 void BBVectorize::computeConnectedPairs( 1347 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1348 DenseSet<ValuePair> &CandidatePairsSet, 1349 std::vector<Value *> &PairableInsts, 1350 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1351 DenseMap<VPPair, unsigned> &PairConnectionTypes) { 1352 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 1353 PE = PairableInsts.end(); PI != PE; ++PI) { 1354 DenseMap<Value *, std::vector<Value *> >::iterator PP = 1355 CandidatePairs.find(*PI); 1356 if (PP == CandidatePairs.end()) 1357 continue; 1358 1359 for (std::vector<Value *>::iterator P = PP->second.begin(), 1360 E = PP->second.end(); P != E; ++P) 1361 computePairsConnectedTo(CandidatePairs, CandidatePairsSet, 1362 PairableInsts, ConnectedPairs, 1363 PairConnectionTypes, ValuePair(*PI, *P)); 1364 } 1365 1366 DEBUG(size_t TotalPairs = 0; 1367 for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = 1368 ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I) 1369 TotalPairs += I->second.size(); 1370 dbgs() << "BBV: found " << TotalPairs 1371 << " pair connections.\n"); 1372 } 1373 1374 // This function builds a set of use tuples such that <A, B> is in the set 1375 // if B is in the use tree of A. If B is in the use tree of A, then B 1376 // depends on the output of A. 1377 void BBVectorize::buildDepMap( 1378 BasicBlock &BB, 1379 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1380 std::vector<Value *> &PairableInsts, 1381 DenseSet<ValuePair> &PairableInstUsers) { 1382 DenseSet<Value *> IsInPair; 1383 for (DenseMap<Value *, std::vector<Value *> >::iterator C = 1384 CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) { 1385 IsInPair.insert(C->first); 1386 IsInPair.insert(C->second.begin(), C->second.end()); 1387 } 1388 1389 // Iterate through the basic block, recording all users of each 1390 // pairable instruction. 1391 1392 BasicBlock::iterator E = BB.end(), EL = 1393 BasicBlock::iterator(cast<Instruction>(PairableInsts.back())); 1394 for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { 1395 if (IsInPair.find(I) == IsInPair.end()) continue; 1396 1397 DenseSet<Value *> Users; 1398 AliasSetTracker WriteSet(*AA); 1399 for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) { 1400 (void) trackUsesOfI(Users, WriteSet, I, J); 1401 1402 if (J == EL) 1403 break; 1404 } 1405 1406 for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); 1407 U != E; ++U) { 1408 if (IsInPair.find(*U) == IsInPair.end()) continue; 1409 PairableInstUsers.insert(ValuePair(I, *U)); 1410 } 1411 1412 if (I == EL) 1413 break; 1414 } 1415 } 1416 1417 // Returns true if an input to pair P is an output of pair Q and also an 1418 // input of pair Q is an output of pair P. If this is the case, then these 1419 // two pairs cannot be simultaneously fused. 1420 bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, 1421 DenseSet<ValuePair> &PairableInstUsers, 1422 DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap, 1423 DenseSet<VPPair> *PairableInstUserPairSet) { 1424 // Two pairs are in conflict if they are mutual Users of eachother. 1425 bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || 1426 PairableInstUsers.count(ValuePair(P.first, Q.second)) || 1427 PairableInstUsers.count(ValuePair(P.second, Q.first)) || 1428 PairableInstUsers.count(ValuePair(P.second, Q.second)); 1429 bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || 1430 PairableInstUsers.count(ValuePair(Q.first, P.second)) || 1431 PairableInstUsers.count(ValuePair(Q.second, P.first)) || 1432 PairableInstUsers.count(ValuePair(Q.second, P.second)); 1433 if (PairableInstUserMap) { 1434 // FIXME: The expensive part of the cycle check is not so much the cycle 1435 // check itself but this edge insertion procedure. This needs some 1436 // profiling and probably a different data structure. 1437 if (PUsesQ) { 1438 if (PairableInstUserPairSet->insert(VPPair(Q, P)).second) 1439 (*PairableInstUserMap)[Q].push_back(P); 1440 } 1441 if (QUsesP) { 1442 if (PairableInstUserPairSet->insert(VPPair(P, Q)).second) 1443 (*PairableInstUserMap)[P].push_back(Q); 1444 } 1445 } 1446 1447 return (QUsesP && PUsesQ); 1448 } 1449 1450 // This function walks the use graph of current pairs to see if, starting 1451 // from P, the walk returns to P. 1452 bool BBVectorize::pairWillFormCycle(ValuePair P, 1453 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 1454 DenseSet<ValuePair> &CurrentPairs) { 1455 DEBUG(if (DebugCycleCheck) 1456 dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " 1457 << *P.second << "\n"); 1458 // A lookup table of visisted pairs is kept because the PairableInstUserMap 1459 // contains non-direct associations. 1460 DenseSet<ValuePair> Visited; 1461 SmallVector<ValuePair, 32> Q; 1462 // General depth-first post-order traversal: 1463 Q.push_back(P); 1464 do { 1465 ValuePair QTop = Q.pop_back_val(); 1466 Visited.insert(QTop); 1467 1468 DEBUG(if (DebugCycleCheck) 1469 dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " 1470 << *QTop.second << "\n"); 1471 DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = 1472 PairableInstUserMap.find(QTop); 1473 if (QQ == PairableInstUserMap.end()) 1474 continue; 1475 1476 for (std::vector<ValuePair>::iterator C = QQ->second.begin(), 1477 CE = QQ->second.end(); C != CE; ++C) { 1478 if (*C == P) { 1479 DEBUG(dbgs() 1480 << "BBV: rejected to prevent non-trivial cycle formation: " 1481 << QTop.first << " <-> " << C->second << "\n"); 1482 return true; 1483 } 1484 1485 if (CurrentPairs.count(*C) && !Visited.count(*C)) 1486 Q.push_back(*C); 1487 } 1488 } while (!Q.empty()); 1489 1490 return false; 1491 } 1492 1493 // This function builds the initial tree of connected pairs with the 1494 // pair J at the root. 1495 void BBVectorize::buildInitialTreeFor( 1496 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1497 DenseSet<ValuePair> &CandidatePairsSet, 1498 std::vector<Value *> &PairableInsts, 1499 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1500 DenseSet<ValuePair> &PairableInstUsers, 1501 DenseMap<Value *, Value *> &ChosenPairs, 1502 DenseMap<ValuePair, size_t> &Tree, ValuePair J) { 1503 // Each of these pairs is viewed as the root node of a Tree. The Tree 1504 // is then walked (depth-first). As this happens, we keep track of 1505 // the pairs that compose the Tree and the maximum depth of the Tree. 1506 SmallVector<ValuePairWithDepth, 32> Q; 1507 // General depth-first post-order traversal: 1508 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1509 do { 1510 ValuePairWithDepth QTop = Q.back(); 1511 1512 // Push each child onto the queue: 1513 bool MoreChildren = false; 1514 size_t MaxChildDepth = QTop.second; 1515 DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = 1516 ConnectedPairs.find(QTop.first); 1517 if (QQ != ConnectedPairs.end()) 1518 for (std::vector<ValuePair>::iterator k = QQ->second.begin(), 1519 ke = QQ->second.end(); k != ke; ++k) { 1520 // Make sure that this child pair is still a candidate: 1521 if (CandidatePairsSet.count(*k)) { 1522 DenseMap<ValuePair, size_t>::iterator C = Tree.find(*k); 1523 if (C == Tree.end()) { 1524 size_t d = getDepthFactor(k->first); 1525 Q.push_back(ValuePairWithDepth(*k, QTop.second+d)); 1526 MoreChildren = true; 1527 } else { 1528 MaxChildDepth = std::max(MaxChildDepth, C->second); 1529 } 1530 } 1531 } 1532 1533 if (!MoreChildren) { 1534 // Record the current pair as part of the Tree: 1535 Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); 1536 Q.pop_back(); 1537 } 1538 } while (!Q.empty()); 1539 } 1540 1541 // Given some initial tree, prune it by removing conflicting pairs (pairs 1542 // that cannot be simultaneously chosen for vectorization). 1543 void BBVectorize::pruneTreeFor( 1544 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1545 std::vector<Value *> &PairableInsts, 1546 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1547 DenseSet<ValuePair> &PairableInstUsers, 1548 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 1549 DenseSet<VPPair> &PairableInstUserPairSet, 1550 DenseMap<Value *, Value *> &ChosenPairs, 1551 DenseMap<ValuePair, size_t> &Tree, 1552 DenseSet<ValuePair> &PrunedTree, ValuePair J, 1553 bool UseCycleCheck) { 1554 SmallVector<ValuePairWithDepth, 32> Q; 1555 // General depth-first post-order traversal: 1556 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1557 do { 1558 ValuePairWithDepth QTop = Q.pop_back_val(); 1559 PrunedTree.insert(QTop.first); 1560 1561 // Visit each child, pruning as necessary... 1562 SmallVector<ValuePairWithDepth, 8> BestChildren; 1563 DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = 1564 ConnectedPairs.find(QTop.first); 1565 if (QQ == ConnectedPairs.end()) 1566 continue; 1567 1568 for (std::vector<ValuePair>::iterator K = QQ->second.begin(), 1569 KE = QQ->second.end(); K != KE; ++K) { 1570 DenseMap<ValuePair, size_t>::iterator C = Tree.find(*K); 1571 if (C == Tree.end()) continue; 1572 1573 // This child is in the Tree, now we need to make sure it is the 1574 // best of any conflicting children. There could be multiple 1575 // conflicting children, so first, determine if we're keeping 1576 // this child, then delete conflicting children as necessary. 1577 1578 // It is also necessary to guard against pairing-induced 1579 // dependencies. Consider instructions a .. x .. y .. b 1580 // such that (a,b) are to be fused and (x,y) are to be fused 1581 // but a is an input to x and b is an output from y. This 1582 // means that y cannot be moved after b but x must be moved 1583 // after b for (a,b) to be fused. In other words, after 1584 // fusing (a,b) we have y .. a/b .. x where y is an input 1585 // to a/b and x is an output to a/b: x and y can no longer 1586 // be legally fused. To prevent this condition, we must 1587 // make sure that a child pair added to the Tree is not 1588 // both an input and output of an already-selected pair. 1589 1590 // Pairing-induced dependencies can also form from more complicated 1591 // cycles. The pair vs. pair conflicts are easy to check, and so 1592 // that is done explicitly for "fast rejection", and because for 1593 // child vs. child conflicts, we may prefer to keep the current 1594 // pair in preference to the already-selected child. 1595 DenseSet<ValuePair> CurrentPairs; 1596 1597 bool CanAdd = true; 1598 for (SmallVector<ValuePairWithDepth, 8>::iterator C2 1599 = BestChildren.begin(), E2 = BestChildren.end(); 1600 C2 != E2; ++C2) { 1601 if (C2->first.first == C->first.first || 1602 C2->first.first == C->first.second || 1603 C2->first.second == C->first.first || 1604 C2->first.second == C->first.second || 1605 pairsConflict(C2->first, C->first, PairableInstUsers, 1606 UseCycleCheck ? &PairableInstUserMap : 0, 1607 UseCycleCheck ? &PairableInstUserPairSet : 0)) { 1608 if (C2->second >= C->second) { 1609 CanAdd = false; 1610 break; 1611 } 1612 1613 CurrentPairs.insert(C2->first); 1614 } 1615 } 1616 if (!CanAdd) continue; 1617 1618 // Even worse, this child could conflict with another node already 1619 // selected for the Tree. If that is the case, ignore this child. 1620 for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(), 1621 E2 = PrunedTree.end(); T != E2; ++T) { 1622 if (T->first == C->first.first || 1623 T->first == C->first.second || 1624 T->second == C->first.first || 1625 T->second == C->first.second || 1626 pairsConflict(*T, C->first, PairableInstUsers, 1627 UseCycleCheck ? &PairableInstUserMap : 0, 1628 UseCycleCheck ? &PairableInstUserPairSet : 0)) { 1629 CanAdd = false; 1630 break; 1631 } 1632 1633 CurrentPairs.insert(*T); 1634 } 1635 if (!CanAdd) continue; 1636 1637 // And check the queue too... 1638 for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(), 1639 E2 = Q.end(); C2 != E2; ++C2) { 1640 if (C2->first.first == C->first.first || 1641 C2->first.first == C->first.second || 1642 C2->first.second == C->first.first || 1643 C2->first.second == C->first.second || 1644 pairsConflict(C2->first, C->first, PairableInstUsers, 1645 UseCycleCheck ? &PairableInstUserMap : 0, 1646 UseCycleCheck ? &PairableInstUserPairSet : 0)) { 1647 CanAdd = false; 1648 break; 1649 } 1650 1651 CurrentPairs.insert(C2->first); 1652 } 1653 if (!CanAdd) continue; 1654 1655 // Last but not least, check for a conflict with any of the 1656 // already-chosen pairs. 1657 for (DenseMap<Value *, Value *>::iterator C2 = 1658 ChosenPairs.begin(), E2 = ChosenPairs.end(); 1659 C2 != E2; ++C2) { 1660 if (pairsConflict(*C2, C->first, PairableInstUsers, 1661 UseCycleCheck ? &PairableInstUserMap : 0, 1662 UseCycleCheck ? &PairableInstUserPairSet : 0)) { 1663 CanAdd = false; 1664 break; 1665 } 1666 1667 CurrentPairs.insert(*C2); 1668 } 1669 if (!CanAdd) continue; 1670 1671 // To check for non-trivial cycles formed by the addition of the 1672 // current pair we've formed a list of all relevant pairs, now use a 1673 // graph walk to check for a cycle. We start from the current pair and 1674 // walk the use tree to see if we again reach the current pair. If we 1675 // do, then the current pair is rejected. 1676 1677 // FIXME: It may be more efficient to use a topological-ordering 1678 // algorithm to improve the cycle check. This should be investigated. 1679 if (UseCycleCheck && 1680 pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) 1681 continue; 1682 1683 // This child can be added, but we may have chosen it in preference 1684 // to an already-selected child. Check for this here, and if a 1685 // conflict is found, then remove the previously-selected child 1686 // before adding this one in its place. 1687 for (SmallVector<ValuePairWithDepth, 8>::iterator C2 1688 = BestChildren.begin(); C2 != BestChildren.end();) { 1689 if (C2->first.first == C->first.first || 1690 C2->first.first == C->first.second || 1691 C2->first.second == C->first.first || 1692 C2->first.second == C->first.second || 1693 pairsConflict(C2->first, C->first, PairableInstUsers)) 1694 C2 = BestChildren.erase(C2); 1695 else 1696 ++C2; 1697 } 1698 1699 BestChildren.push_back(ValuePairWithDepth(C->first, C->second)); 1700 } 1701 1702 for (SmallVector<ValuePairWithDepth, 8>::iterator C 1703 = BestChildren.begin(), E2 = BestChildren.end(); 1704 C != E2; ++C) { 1705 size_t DepthF = getDepthFactor(C->first.first); 1706 Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); 1707 } 1708 } while (!Q.empty()); 1709 } 1710 1711 // This function finds the best tree of mututally-compatible connected 1712 // pairs, given the choice of root pairs as an iterator range. 1713 void BBVectorize::findBestTreeFor( 1714 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1715 DenseSet<ValuePair> &CandidatePairsSet, 1716 DenseMap<ValuePair, int> &CandidatePairCostSavings, 1717 std::vector<Value *> &PairableInsts, 1718 DenseSet<ValuePair> &FixedOrderPairs, 1719 DenseMap<VPPair, unsigned> &PairConnectionTypes, 1720 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1721 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 1722 DenseSet<ValuePair> &PairableInstUsers, 1723 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 1724 DenseSet<VPPair> &PairableInstUserPairSet, 1725 DenseMap<Value *, Value *> &ChosenPairs, 1726 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, 1727 int &BestEffSize, Value *II, std::vector<Value *>&JJ, 1728 bool UseCycleCheck) { 1729 for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end(); 1730 J != JE; ++J) { 1731 ValuePair IJ(II, *J); 1732 if (!CandidatePairsSet.count(IJ)) 1733 continue; 1734 1735 // Before going any further, make sure that this pair does not 1736 // conflict with any already-selected pairs (see comment below 1737 // near the Tree pruning for more details). 1738 DenseSet<ValuePair> ChosenPairSet; 1739 bool DoesConflict = false; 1740 for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), 1741 E = ChosenPairs.end(); C != E; ++C) { 1742 if (pairsConflict(*C, IJ, PairableInstUsers, 1743 UseCycleCheck ? &PairableInstUserMap : 0, 1744 UseCycleCheck ? &PairableInstUserPairSet : 0)) { 1745 DoesConflict = true; 1746 break; 1747 } 1748 1749 ChosenPairSet.insert(*C); 1750 } 1751 if (DoesConflict) continue; 1752 1753 if (UseCycleCheck && 1754 pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet)) 1755 continue; 1756 1757 DenseMap<ValuePair, size_t> Tree; 1758 buildInitialTreeFor(CandidatePairs, CandidatePairsSet, 1759 PairableInsts, ConnectedPairs, 1760 PairableInstUsers, ChosenPairs, Tree, IJ); 1761 1762 // Because we'll keep the child with the largest depth, the largest 1763 // depth is still the same in the unpruned Tree. 1764 size_t MaxDepth = Tree.lookup(IJ); 1765 1766 DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {" 1767 << IJ.first << " <-> " << IJ.second << "} of depth " << 1768 MaxDepth << " and size " << Tree.size() << "\n"); 1769 1770 // At this point the Tree has been constructed, but, may contain 1771 // contradictory children (meaning that different children of 1772 // some tree node may be attempting to fuse the same instruction). 1773 // So now we walk the tree again, in the case of a conflict, 1774 // keep only the child with the largest depth. To break a tie, 1775 // favor the first child. 1776 1777 DenseSet<ValuePair> PrunedTree; 1778 pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1779 PairableInstUsers, PairableInstUserMap, 1780 PairableInstUserPairSet, 1781 ChosenPairs, Tree, PrunedTree, IJ, UseCycleCheck); 1782 1783 int EffSize = 0; 1784 if (TTI) { 1785 DenseSet<Value *> PrunedTreeInstrs; 1786 for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), 1787 E = PrunedTree.end(); S != E; ++S) { 1788 PrunedTreeInstrs.insert(S->first); 1789 PrunedTreeInstrs.insert(S->second); 1790 } 1791 1792 // The set of pairs that have already contributed to the total cost. 1793 DenseSet<ValuePair> IncomingPairs; 1794 1795 // If the cost model were perfect, this might not be necessary; but we 1796 // need to make sure that we don't get stuck vectorizing our own 1797 // shuffle chains. 1798 bool HasNontrivialInsts = false; 1799 1800 // The node weights represent the cost savings associated with 1801 // fusing the pair of instructions. 1802 for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), 1803 E = PrunedTree.end(); S != E; ++S) { 1804 if (!isa<ShuffleVectorInst>(S->first) && 1805 !isa<InsertElementInst>(S->first) && 1806 !isa<ExtractElementInst>(S->first)) 1807 HasNontrivialInsts = true; 1808 1809 bool FlipOrder = false; 1810 1811 if (getDepthFactor(S->first)) { 1812 int ESContrib = CandidatePairCostSavings.find(*S)->second; 1813 DEBUG(if (DebugPairSelection) dbgs() << "\tweight {" 1814 << *S->first << " <-> " << *S->second << "} = " << 1815 ESContrib << "\n"); 1816 EffSize += ESContrib; 1817 } 1818 1819 // The edge weights contribute in a negative sense: they represent 1820 // the cost of shuffles. 1821 DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS = 1822 ConnectedPairDeps.find(*S); 1823 if (SS != ConnectedPairDeps.end()) { 1824 unsigned NumDepsDirect = 0, NumDepsSwap = 0; 1825 for (std::vector<ValuePair>::iterator T = SS->second.begin(), 1826 TE = SS->second.end(); T != TE; ++T) { 1827 VPPair Q(*S, *T); 1828 if (!PrunedTree.count(Q.second)) 1829 continue; 1830 DenseMap<VPPair, unsigned>::iterator R = 1831 PairConnectionTypes.find(VPPair(Q.second, Q.first)); 1832 assert(R != PairConnectionTypes.end() && 1833 "Cannot find pair connection type"); 1834 if (R->second == PairConnectionDirect) 1835 ++NumDepsDirect; 1836 else if (R->second == PairConnectionSwap) 1837 ++NumDepsSwap; 1838 } 1839 1840 // If there are more swaps than direct connections, then 1841 // the pair order will be flipped during fusion. So the real 1842 // number of swaps is the minimum number. 1843 FlipOrder = !FixedOrderPairs.count(*S) && 1844 ((NumDepsSwap > NumDepsDirect) || 1845 FixedOrderPairs.count(ValuePair(S->second, S->first))); 1846 1847 for (std::vector<ValuePair>::iterator T = SS->second.begin(), 1848 TE = SS->second.end(); T != TE; ++T) { 1849 VPPair Q(*S, *T); 1850 if (!PrunedTree.count(Q.second)) 1851 continue; 1852 DenseMap<VPPair, unsigned>::iterator R = 1853 PairConnectionTypes.find(VPPair(Q.second, Q.first)); 1854 assert(R != PairConnectionTypes.end() && 1855 "Cannot find pair connection type"); 1856 Type *Ty1 = Q.second.first->getType(), 1857 *Ty2 = Q.second.second->getType(); 1858 Type *VTy = getVecTypeForPair(Ty1, Ty2); 1859 if ((R->second == PairConnectionDirect && FlipOrder) || 1860 (R->second == PairConnectionSwap && !FlipOrder) || 1861 R->second == PairConnectionSplat) { 1862 int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 1863 VTy, VTy); 1864 1865 if (VTy->getVectorNumElements() == 2) { 1866 if (R->second == PairConnectionSplat) 1867 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 1868 TargetTransformInfo::SK_Broadcast, VTy)); 1869 else 1870 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 1871 TargetTransformInfo::SK_Reverse, VTy)); 1872 } 1873 1874 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 1875 *Q.second.first << " <-> " << *Q.second.second << 1876 "} -> {" << 1877 *S->first << " <-> " << *S->second << "} = " << 1878 ESContrib << "\n"); 1879 EffSize -= ESContrib; 1880 } 1881 } 1882 } 1883 1884 // Compute the cost of outgoing edges. We assume that edges outgoing 1885 // to shuffles, inserts or extracts can be merged, and so contribute 1886 // no additional cost. 1887 if (!S->first->getType()->isVoidTy()) { 1888 Type *Ty1 = S->first->getType(), 1889 *Ty2 = S->second->getType(); 1890 Type *VTy = getVecTypeForPair(Ty1, Ty2); 1891 1892 bool NeedsExtraction = false; 1893 for (Value::use_iterator I = S->first->use_begin(), 1894 IE = S->first->use_end(); I != IE; ++I) { 1895 if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) { 1896 // Shuffle can be folded if it has no other input 1897 if (isa<UndefValue>(SI->getOperand(1))) 1898 continue; 1899 } 1900 if (isa<ExtractElementInst>(*I)) 1901 continue; 1902 if (PrunedTreeInstrs.count(*I)) 1903 continue; 1904 NeedsExtraction = true; 1905 break; 1906 } 1907 1908 if (NeedsExtraction) { 1909 int ESContrib; 1910 if (Ty1->isVectorTy()) { 1911 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 1912 Ty1, VTy); 1913 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 1914 TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1)); 1915 } else 1916 ESContrib = (int) TTI->getVectorInstrCost( 1917 Instruction::ExtractElement, VTy, 0); 1918 1919 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 1920 *S->first << "} = " << ESContrib << "\n"); 1921 EffSize -= ESContrib; 1922 } 1923 1924 NeedsExtraction = false; 1925 for (Value::use_iterator I = S->second->use_begin(), 1926 IE = S->second->use_end(); I != IE; ++I) { 1927 if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) { 1928 // Shuffle can be folded if it has no other input 1929 if (isa<UndefValue>(SI->getOperand(1))) 1930 continue; 1931 } 1932 if (isa<ExtractElementInst>(*I)) 1933 continue; 1934 if (PrunedTreeInstrs.count(*I)) 1935 continue; 1936 NeedsExtraction = true; 1937 break; 1938 } 1939 1940 if (NeedsExtraction) { 1941 int ESContrib; 1942 if (Ty2->isVectorTy()) { 1943 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 1944 Ty2, VTy); 1945 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 1946 TargetTransformInfo::SK_ExtractSubvector, VTy, 1947 Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2)); 1948 } else 1949 ESContrib = (int) TTI->getVectorInstrCost( 1950 Instruction::ExtractElement, VTy, 1); 1951 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 1952 *S->second << "} = " << ESContrib << "\n"); 1953 EffSize -= ESContrib; 1954 } 1955 } 1956 1957 // Compute the cost of incoming edges. 1958 if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) { 1959 Instruction *S1 = cast<Instruction>(S->first), 1960 *S2 = cast<Instruction>(S->second); 1961 for (unsigned o = 0; o < S1->getNumOperands(); ++o) { 1962 Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); 1963 1964 // Combining constants into vector constants (or small vector 1965 // constants into larger ones are assumed free). 1966 if (isa<Constant>(O1) && isa<Constant>(O2)) 1967 continue; 1968 1969 if (FlipOrder) 1970 std::swap(O1, O2); 1971 1972 ValuePair VP = ValuePair(O1, O2); 1973 ValuePair VPR = ValuePair(O2, O1); 1974 1975 // Internal edges are not handled here. 1976 if (PrunedTree.count(VP) || PrunedTree.count(VPR)) 1977 continue; 1978 1979 Type *Ty1 = O1->getType(), 1980 *Ty2 = O2->getType(); 1981 Type *VTy = getVecTypeForPair(Ty1, Ty2); 1982 1983 // Combining vector operations of the same type is also assumed 1984 // folded with other operations. 1985 if (Ty1 == Ty2) { 1986 // If both are insert elements, then both can be widened. 1987 InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1), 1988 *IEO2 = dyn_cast<InsertElementInst>(O2); 1989 if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2)) 1990 continue; 1991 // If both are extract elements, and both have the same input 1992 // type, then they can be replaced with a shuffle 1993 ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1), 1994 *EIO2 = dyn_cast<ExtractElementInst>(O2); 1995 if (EIO1 && EIO2 && 1996 EIO1->getOperand(0)->getType() == 1997 EIO2->getOperand(0)->getType()) 1998 continue; 1999 // If both are a shuffle with equal operand types and only two 2000 // unqiue operands, then they can be replaced with a single 2001 // shuffle 2002 ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1), 2003 *SIO2 = dyn_cast<ShuffleVectorInst>(O2); 2004 if (SIO1 && SIO2 && 2005 SIO1->getOperand(0)->getType() == 2006 SIO2->getOperand(0)->getType()) { 2007 SmallSet<Value *, 4> SIOps; 2008 SIOps.insert(SIO1->getOperand(0)); 2009 SIOps.insert(SIO1->getOperand(1)); 2010 SIOps.insert(SIO2->getOperand(0)); 2011 SIOps.insert(SIO2->getOperand(1)); 2012 if (SIOps.size() <= 2) 2013 continue; 2014 } 2015 } 2016 2017 int ESContrib; 2018 // This pair has already been formed. 2019 if (IncomingPairs.count(VP)) { 2020 continue; 2021 } else if (IncomingPairs.count(VPR)) { 2022 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 2023 VTy, VTy); 2024 2025 if (VTy->getVectorNumElements() == 2) 2026 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 2027 TargetTransformInfo::SK_Reverse, VTy)); 2028 } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { 2029 ESContrib = (int) TTI->getVectorInstrCost( 2030 Instruction::InsertElement, VTy, 0); 2031 ESContrib += (int) TTI->getVectorInstrCost( 2032 Instruction::InsertElement, VTy, 1); 2033 } else if (!Ty1->isVectorTy()) { 2034 // O1 needs to be inserted into a vector of size O2, and then 2035 // both need to be shuffled together. 2036 ESContrib = (int) TTI->getVectorInstrCost( 2037 Instruction::InsertElement, Ty2, 0); 2038 ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 2039 VTy, Ty2); 2040 } else if (!Ty2->isVectorTy()) { 2041 // O2 needs to be inserted into a vector of size O1, and then 2042 // both need to be shuffled together. 2043 ESContrib = (int) TTI->getVectorInstrCost( 2044 Instruction::InsertElement, Ty1, 0); 2045 ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 2046 VTy, Ty1); 2047 } else { 2048 Type *TyBig = Ty1, *TySmall = Ty2; 2049 if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) 2050 std::swap(TyBig, TySmall); 2051 2052 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 2053 VTy, TyBig); 2054 if (TyBig != TySmall) 2055 ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 2056 TyBig, TySmall); 2057 } 2058 2059 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" 2060 << *O1 << " <-> " << *O2 << "} = " << 2061 ESContrib << "\n"); 2062 EffSize -= ESContrib; 2063 IncomingPairs.insert(VP); 2064 } 2065 } 2066 } 2067 2068 if (!HasNontrivialInsts) { 2069 DEBUG(if (DebugPairSelection) dbgs() << 2070 "\tNo non-trivial instructions in tree;" 2071 " override to zero effective size\n"); 2072 EffSize = 0; 2073 } 2074 } else { 2075 for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), 2076 E = PrunedTree.end(); S != E; ++S) 2077 EffSize += (int) getDepthFactor(S->first); 2078 } 2079 2080 DEBUG(if (DebugPairSelection) 2081 dbgs() << "BBV: found pruned Tree for pair {" 2082 << IJ.first << " <-> " << IJ.second << "} of depth " << 2083 MaxDepth << " and size " << PrunedTree.size() << 2084 " (effective size: " << EffSize << ")\n"); 2085 if (((TTI && !UseChainDepthWithTI) || 2086 MaxDepth >= Config.ReqChainDepth) && 2087 EffSize > 0 && EffSize > BestEffSize) { 2088 BestMaxDepth = MaxDepth; 2089 BestEffSize = EffSize; 2090 BestTree = PrunedTree; 2091 } 2092 } 2093 } 2094 2095 // Given the list of candidate pairs, this function selects those 2096 // that will be fused into vector instructions. 2097 void BBVectorize::choosePairs( 2098 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 2099 DenseSet<ValuePair> &CandidatePairsSet, 2100 DenseMap<ValuePair, int> &CandidatePairCostSavings, 2101 std::vector<Value *> &PairableInsts, 2102 DenseSet<ValuePair> &FixedOrderPairs, 2103 DenseMap<VPPair, unsigned> &PairConnectionTypes, 2104 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 2105 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 2106 DenseSet<ValuePair> &PairableInstUsers, 2107 DenseMap<Value *, Value *>& ChosenPairs) { 2108 bool UseCycleCheck = 2109 CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck; 2110 2111 DenseMap<Value *, std::vector<Value *> > CandidatePairs2; 2112 for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(), 2113 E = CandidatePairsSet.end(); I != E; ++I) { 2114 std::vector<Value *> &JJ = CandidatePairs2[I->second]; 2115 if (JJ.empty()) JJ.reserve(32); 2116 JJ.push_back(I->first); 2117 } 2118 2119 DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap; 2120 DenseSet<VPPair> PairableInstUserPairSet; 2121 for (std::vector<Value *>::iterator I = PairableInsts.begin(), 2122 E = PairableInsts.end(); I != E; ++I) { 2123 // The number of possible pairings for this variable: 2124 size_t NumChoices = CandidatePairs.lookup(*I).size(); 2125 if (!NumChoices) continue; 2126 2127 std::vector<Value *> &JJ = CandidatePairs[*I]; 2128 2129 // The best pair to choose and its tree: 2130 size_t BestMaxDepth = 0; 2131 int BestEffSize = 0; 2132 DenseSet<ValuePair> BestTree; 2133 findBestTreeFor(CandidatePairs, CandidatePairsSet, 2134 CandidatePairCostSavings, 2135 PairableInsts, FixedOrderPairs, PairConnectionTypes, 2136 ConnectedPairs, ConnectedPairDeps, 2137 PairableInstUsers, PairableInstUserMap, 2138 PairableInstUserPairSet, ChosenPairs, 2139 BestTree, BestMaxDepth, BestEffSize, *I, JJ, 2140 UseCycleCheck); 2141 2142 if (BestTree.empty()) 2143 continue; 2144 2145 // A tree has been chosen (or not) at this point. If no tree was 2146 // chosen, then this instruction, I, cannot be paired (and is no longer 2147 // considered). 2148 2149 DEBUG(dbgs() << "BBV: selected pairs in the best tree for: " 2150 << *cast<Instruction>(*I) << "\n"); 2151 2152 for (DenseSet<ValuePair>::iterator S = BestTree.begin(), 2153 SE2 = BestTree.end(); S != SE2; ++S) { 2154 // Insert the members of this tree into the list of chosen pairs. 2155 ChosenPairs.insert(ValuePair(S->first, S->second)); 2156 DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << 2157 *S->second << "\n"); 2158 2159 // Remove all candidate pairs that have values in the chosen tree. 2160 std::vector<Value *> &KK = CandidatePairs[S->first], 2161 &LL = CandidatePairs2[S->second], 2162 &MM = CandidatePairs[S->second], 2163 &NN = CandidatePairs2[S->first]; 2164 for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end(); 2165 K != KE; ++K) { 2166 if (*K == S->second) 2167 continue; 2168 2169 CandidatePairsSet.erase(ValuePair(S->first, *K)); 2170 } 2171 for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end(); 2172 L != LE; ++L) { 2173 if (*L == S->first) 2174 continue; 2175 2176 CandidatePairsSet.erase(ValuePair(*L, S->second)); 2177 } 2178 for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end(); 2179 M != ME; ++M) { 2180 assert(*M != S->first && "Flipped pair in candidate list?"); 2181 CandidatePairsSet.erase(ValuePair(S->second, *M)); 2182 } 2183 for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end(); 2184 N != NE; ++N) { 2185 assert(*N != S->second && "Flipped pair in candidate list?"); 2186 CandidatePairsSet.erase(ValuePair(*N, S->first)); 2187 } 2188 } 2189 } 2190 2191 DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); 2192 } 2193 2194 std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, 2195 unsigned n = 0) { 2196 if (!I->hasName()) 2197 return ""; 2198 2199 return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + 2200 (n > 0 ? "." + utostr(n) : "")).str(); 2201 } 2202 2203 // Returns the value that is to be used as the pointer input to the vector 2204 // instruction that fuses I with J. 2205 Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, 2206 Instruction *I, Instruction *J, unsigned o) { 2207 Value *IPtr, *JPtr; 2208 unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; 2209 int64_t OffsetInElmts; 2210 2211 // Note: the analysis might fail here, that is why the pair order has 2212 // been precomputed (OffsetInElmts must be unused here). 2213 (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 2214 IAddressSpace, JAddressSpace, 2215 OffsetInElmts, false); 2216 2217 // The pointer value is taken to be the one with the lowest offset. 2218 Value *VPtr = IPtr; 2219 2220 Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType(); 2221 Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType(); 2222 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2223 Type *VArgPtrType = PointerType::get(VArgType, 2224 cast<PointerType>(IPtr->getType())->getAddressSpace()); 2225 return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), 2226 /* insert before */ I); 2227 } 2228 2229 void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, 2230 unsigned MaskOffset, unsigned NumInElem, 2231 unsigned NumInElem1, unsigned IdxOffset, 2232 std::vector<Constant*> &Mask) { 2233 unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements(); 2234 for (unsigned v = 0; v < NumElem1; ++v) { 2235 int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); 2236 if (m < 0) { 2237 Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); 2238 } else { 2239 unsigned mm = m + (int) IdxOffset; 2240 if (m >= (int) NumInElem1) 2241 mm += (int) NumInElem; 2242 2243 Mask[v+MaskOffset] = 2244 ConstantInt::get(Type::getInt32Ty(Context), mm); 2245 } 2246 } 2247 } 2248 2249 // Returns the value that is to be used as the vector-shuffle mask to the 2250 // vector instruction that fuses I with J. 2251 Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, 2252 Instruction *I, Instruction *J) { 2253 // This is the shuffle mask. We need to append the second 2254 // mask to the first, and the numbers need to be adjusted. 2255 2256 Type *ArgTypeI = I->getType(); 2257 Type *ArgTypeJ = J->getType(); 2258 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2259 2260 unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements(); 2261 2262 // Get the total number of elements in the fused vector type. 2263 // By definition, this must equal the number of elements in 2264 // the final mask. 2265 unsigned NumElem = cast<VectorType>(VArgType)->getNumElements(); 2266 std::vector<Constant*> Mask(NumElem); 2267 2268 Type *OpTypeI = I->getOperand(0)->getType(); 2269 unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements(); 2270 Type *OpTypeJ = J->getOperand(0)->getType(); 2271 unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements(); 2272 2273 // The fused vector will be: 2274 // ----------------------------------------------------- 2275 // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ | 2276 // ----------------------------------------------------- 2277 // from which we'll extract NumElem total elements (where the first NumElemI 2278 // of them come from the mask in I and the remainder come from the mask 2279 // in J. 2280 2281 // For the mask from the first pair... 2282 fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI, 2283 0, Mask); 2284 2285 // For the mask from the second pair... 2286 fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ, 2287 NumInElemI, Mask); 2288 2289 return ConstantVector::get(Mask); 2290 } 2291 2292 bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I, 2293 Instruction *J, unsigned o, Value *&LOp, 2294 unsigned numElemL, 2295 Type *ArgTypeL, Type *ArgTypeH, 2296 bool IBeforeJ, unsigned IdxOff) { 2297 bool ExpandedIEChain = false; 2298 if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) { 2299 // If we have a pure insertelement chain, then this can be rewritten 2300 // into a chain that directly builds the larger type. 2301 if (isPureIEChain(LIE)) { 2302 SmallVector<Value *, 8> VectElemts(numElemL, 2303 UndefValue::get(ArgTypeL->getScalarType())); 2304 InsertElementInst *LIENext = LIE; 2305 do { 2306 unsigned Idx = 2307 cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue(); 2308 VectElemts[Idx] = LIENext->getOperand(1); 2309 } while ((LIENext = 2310 dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); 2311 2312 LIENext = 0; 2313 Value *LIEPrev = UndefValue::get(ArgTypeH); 2314 for (unsigned i = 0; i < numElemL; ++i) { 2315 if (isa<UndefValue>(VectElemts[i])) continue; 2316 LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], 2317 ConstantInt::get(Type::getInt32Ty(Context), 2318 i + IdxOff), 2319 getReplacementName(IBeforeJ ? I : J, 2320 true, o, i+1)); 2321 LIENext->insertBefore(IBeforeJ ? J : I); 2322 LIEPrev = LIENext; 2323 } 2324 2325 LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH); 2326 ExpandedIEChain = true; 2327 } 2328 } 2329 2330 return ExpandedIEChain; 2331 } 2332 2333 // Returns the value to be used as the specified operand of the vector 2334 // instruction that fuses I with J. 2335 Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, 2336 Instruction *J, unsigned o, bool IBeforeJ) { 2337 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 2338 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); 2339 2340 // Compute the fused vector type for this operand 2341 Type *ArgTypeI = I->getOperand(o)->getType(); 2342 Type *ArgTypeJ = J->getOperand(o)->getType(); 2343 VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2344 2345 Instruction *L = I, *H = J; 2346 Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; 2347 2348 unsigned numElemL; 2349 if (ArgTypeL->isVectorTy()) 2350 numElemL = cast<VectorType>(ArgTypeL)->getNumElements(); 2351 else 2352 numElemL = 1; 2353 2354 unsigned numElemH; 2355 if (ArgTypeH->isVectorTy()) 2356 numElemH = cast<VectorType>(ArgTypeH)->getNumElements(); 2357 else 2358 numElemH = 1; 2359 2360 Value *LOp = L->getOperand(o); 2361 Value *HOp = H->getOperand(o); 2362 unsigned numElem = VArgType->getNumElements(); 2363 2364 // First, we check if we can reuse the "original" vector outputs (if these 2365 // exist). We might need a shuffle. 2366 ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp); 2367 ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp); 2368 ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp); 2369 ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp); 2370 2371 // FIXME: If we're fusing shuffle instructions, then we can't apply this 2372 // optimization. The input vectors to the shuffle might be a different 2373 // length from the shuffle outputs. Unfortunately, the replacement 2374 // shuffle mask has already been formed, and the mask entries are sensitive 2375 // to the sizes of the inputs. 2376 bool IsSizeChangeShuffle = 2377 isa<ShuffleVectorInst>(L) && 2378 (LOp->getType() != L->getType() || HOp->getType() != H->getType()); 2379 2380 if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) { 2381 // We can have at most two unique vector inputs. 2382 bool CanUseInputs = true; 2383 Value *I1, *I2 = 0; 2384 if (LEE) { 2385 I1 = LEE->getOperand(0); 2386 } else { 2387 I1 = LSV->getOperand(0); 2388 I2 = LSV->getOperand(1); 2389 if (I2 == I1 || isa<UndefValue>(I2)) 2390 I2 = 0; 2391 } 2392 2393 if (HEE) { 2394 Value *I3 = HEE->getOperand(0); 2395 if (!I2 && I3 != I1) 2396 I2 = I3; 2397 else if (I3 != I1 && I3 != I2) 2398 CanUseInputs = false; 2399 } else { 2400 Value *I3 = HSV->getOperand(0); 2401 if (!I2 && I3 != I1) 2402 I2 = I3; 2403 else if (I3 != I1 && I3 != I2) 2404 CanUseInputs = false; 2405 2406 if (CanUseInputs) { 2407 Value *I4 = HSV->getOperand(1); 2408 if (!isa<UndefValue>(I4)) { 2409 if (!I2 && I4 != I1) 2410 I2 = I4; 2411 else if (I4 != I1 && I4 != I2) 2412 CanUseInputs = false; 2413 } 2414 } 2415 } 2416 2417 if (CanUseInputs) { 2418 unsigned LOpElem = 2419 cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType()) 2420 ->getNumElements(); 2421 unsigned HOpElem = 2422 cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType()) 2423 ->getNumElements(); 2424 2425 // We have one or two input vectors. We need to map each index of the 2426 // operands to the index of the original vector. 2427 SmallVector<std::pair<int, int>, 8> II(numElem); 2428 for (unsigned i = 0; i < numElemL; ++i) { 2429 int Idx, INum; 2430 if (LEE) { 2431 Idx = 2432 cast<ConstantInt>(LEE->getOperand(1))->getSExtValue(); 2433 INum = LEE->getOperand(0) == I1 ? 0 : 1; 2434 } else { 2435 Idx = LSV->getMaskValue(i); 2436 if (Idx < (int) LOpElem) { 2437 INum = LSV->getOperand(0) == I1 ? 0 : 1; 2438 } else { 2439 Idx -= LOpElem; 2440 INum = LSV->getOperand(1) == I1 ? 0 : 1; 2441 } 2442 } 2443 2444 II[i] = std::pair<int, int>(Idx, INum); 2445 } 2446 for (unsigned i = 0; i < numElemH; ++i) { 2447 int Idx, INum; 2448 if (HEE) { 2449 Idx = 2450 cast<ConstantInt>(HEE->getOperand(1))->getSExtValue(); 2451 INum = HEE->getOperand(0) == I1 ? 0 : 1; 2452 } else { 2453 Idx = HSV->getMaskValue(i); 2454 if (Idx < (int) HOpElem) { 2455 INum = HSV->getOperand(0) == I1 ? 0 : 1; 2456 } else { 2457 Idx -= HOpElem; 2458 INum = HSV->getOperand(1) == I1 ? 0 : 1; 2459 } 2460 } 2461 2462 II[i + numElemL] = std::pair<int, int>(Idx, INum); 2463 } 2464 2465 // We now have an array which tells us from which index of which 2466 // input vector each element of the operand comes. 2467 VectorType *I1T = cast<VectorType>(I1->getType()); 2468 unsigned I1Elem = I1T->getNumElements(); 2469 2470 if (!I2) { 2471 // In this case there is only one underlying vector input. Check for 2472 // the trivial case where we can use the input directly. 2473 if (I1Elem == numElem) { 2474 bool ElemInOrder = true; 2475 for (unsigned i = 0; i < numElem; ++i) { 2476 if (II[i].first != (int) i && II[i].first != -1) { 2477 ElemInOrder = false; 2478 break; 2479 } 2480 } 2481 2482 if (ElemInOrder) 2483 return I1; 2484 } 2485 2486 // A shuffle is needed. 2487 std::vector<Constant *> Mask(numElem); 2488 for (unsigned i = 0; i < numElem; ++i) { 2489 int Idx = II[i].first; 2490 if (Idx == -1) 2491 Mask[i] = UndefValue::get(Type::getInt32Ty(Context)); 2492 else 2493 Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 2494 } 2495 2496 Instruction *S = 2497 new ShuffleVectorInst(I1, UndefValue::get(I1T), 2498 ConstantVector::get(Mask), 2499 getReplacementName(IBeforeJ ? I : J, 2500 true, o)); 2501 S->insertBefore(IBeforeJ ? J : I); 2502 return S; 2503 } 2504 2505 VectorType *I2T = cast<VectorType>(I2->getType()); 2506 unsigned I2Elem = I2T->getNumElements(); 2507 2508 // This input comes from two distinct vectors. The first step is to 2509 // make sure that both vectors are the same length. If not, the 2510 // smaller one will need to grow before they can be shuffled together. 2511 if (I1Elem < I2Elem) { 2512 std::vector<Constant *> Mask(I2Elem); 2513 unsigned v = 0; 2514 for (; v < I1Elem; ++v) 2515 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2516 for (; v < I2Elem; ++v) 2517 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2518 2519 Instruction *NewI1 = 2520 new ShuffleVectorInst(I1, UndefValue::get(I1T), 2521 ConstantVector::get(Mask), 2522 getReplacementName(IBeforeJ ? I : J, 2523 true, o, 1)); 2524 NewI1->insertBefore(IBeforeJ ? J : I); 2525 I1 = NewI1; 2526 I1T = I2T; 2527 I1Elem = I2Elem; 2528 } else if (I1Elem > I2Elem) { 2529 std::vector<Constant *> Mask(I1Elem); 2530 unsigned v = 0; 2531 for (; v < I2Elem; ++v) 2532 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2533 for (; v < I1Elem; ++v) 2534 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2535 2536 Instruction *NewI2 = 2537 new ShuffleVectorInst(I2, UndefValue::get(I2T), 2538 ConstantVector::get(Mask), 2539 getReplacementName(IBeforeJ ? I : J, 2540 true, o, 1)); 2541 NewI2->insertBefore(IBeforeJ ? J : I); 2542 I2 = NewI2; 2543 I2T = I1T; 2544 I2Elem = I1Elem; 2545 } 2546 2547 // Now that both I1 and I2 are the same length we can shuffle them 2548 // together (and use the result). 2549 std::vector<Constant *> Mask(numElem); 2550 for (unsigned v = 0; v < numElem; ++v) { 2551 if (II[v].first == -1) { 2552 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2553 } else { 2554 int Idx = II[v].first + II[v].second * I1Elem; 2555 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 2556 } 2557 } 2558 2559 Instruction *NewOp = 2560 new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), 2561 getReplacementName(IBeforeJ ? I : J, true, o)); 2562 NewOp->insertBefore(IBeforeJ ? J : I); 2563 return NewOp; 2564 } 2565 } 2566 2567 Type *ArgType = ArgTypeL; 2568 if (numElemL < numElemH) { 2569 if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, 2570 ArgTypeL, VArgType, IBeforeJ, 1)) { 2571 // This is another short-circuit case: we're combining a scalar into 2572 // a vector that is formed by an IE chain. We've just expanded the IE 2573 // chain, now insert the scalar and we're done. 2574 2575 Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, 2576 getReplacementName(IBeforeJ ? I : J, true, o)); 2577 S->insertBefore(IBeforeJ ? J : I); 2578 return S; 2579 } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, 2580 ArgTypeH, IBeforeJ)) { 2581 // The two vector inputs to the shuffle must be the same length, 2582 // so extend the smaller vector to be the same length as the larger one. 2583 Instruction *NLOp; 2584 if (numElemL > 1) { 2585 2586 std::vector<Constant *> Mask(numElemH); 2587 unsigned v = 0; 2588 for (; v < numElemL; ++v) 2589 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2590 for (; v < numElemH; ++v) 2591 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2592 2593 NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), 2594 ConstantVector::get(Mask), 2595 getReplacementName(IBeforeJ ? I : J, 2596 true, o, 1)); 2597 } else { 2598 NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, 2599 getReplacementName(IBeforeJ ? I : J, 2600 true, o, 1)); 2601 } 2602 2603 NLOp->insertBefore(IBeforeJ ? J : I); 2604 LOp = NLOp; 2605 } 2606 2607 ArgType = ArgTypeH; 2608 } else if (numElemL > numElemH) { 2609 if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, 2610 ArgTypeH, VArgType, IBeforeJ)) { 2611 Instruction *S = 2612 InsertElementInst::Create(LOp, HOp, 2613 ConstantInt::get(Type::getInt32Ty(Context), 2614 numElemL), 2615 getReplacementName(IBeforeJ ? I : J, 2616 true, o)); 2617 S->insertBefore(IBeforeJ ? J : I); 2618 return S; 2619 } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, 2620 ArgTypeL, IBeforeJ)) { 2621 Instruction *NHOp; 2622 if (numElemH > 1) { 2623 std::vector<Constant *> Mask(numElemL); 2624 unsigned v = 0; 2625 for (; v < numElemH; ++v) 2626 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2627 for (; v < numElemL; ++v) 2628 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2629 2630 NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), 2631 ConstantVector::get(Mask), 2632 getReplacementName(IBeforeJ ? I : J, 2633 true, o, 1)); 2634 } else { 2635 NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, 2636 getReplacementName(IBeforeJ ? I : J, 2637 true, o, 1)); 2638 } 2639 2640 NHOp->insertBefore(IBeforeJ ? J : I); 2641 HOp = NHOp; 2642 } 2643 } 2644 2645 if (ArgType->isVectorTy()) { 2646 unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); 2647 std::vector<Constant*> Mask(numElem); 2648 for (unsigned v = 0; v < numElem; ++v) { 2649 unsigned Idx = v; 2650 // If the low vector was expanded, we need to skip the extra 2651 // undefined entries. 2652 if (v >= numElemL && numElemH > numElemL) 2653 Idx += (numElemH - numElemL); 2654 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 2655 } 2656 2657 Instruction *BV = new ShuffleVectorInst(LOp, HOp, 2658 ConstantVector::get(Mask), 2659 getReplacementName(IBeforeJ ? I : J, true, o)); 2660 BV->insertBefore(IBeforeJ ? J : I); 2661 return BV; 2662 } 2663 2664 Instruction *BV1 = InsertElementInst::Create( 2665 UndefValue::get(VArgType), LOp, CV0, 2666 getReplacementName(IBeforeJ ? I : J, 2667 true, o, 1)); 2668 BV1->insertBefore(IBeforeJ ? J : I); 2669 Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, 2670 getReplacementName(IBeforeJ ? I : J, 2671 true, o, 2)); 2672 BV2->insertBefore(IBeforeJ ? J : I); 2673 return BV2; 2674 } 2675 2676 // This function creates an array of values that will be used as the inputs 2677 // to the vector instruction that fuses I with J. 2678 void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, 2679 Instruction *I, Instruction *J, 2680 SmallVector<Value *, 3> &ReplacedOperands, 2681 bool IBeforeJ) { 2682 unsigned NumOperands = I->getNumOperands(); 2683 2684 for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { 2685 // Iterate backward so that we look at the store pointer 2686 // first and know whether or not we need to flip the inputs. 2687 2688 if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { 2689 // This is the pointer for a load/store instruction. 2690 ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o); 2691 continue; 2692 } else if (isa<CallInst>(I)) { 2693 Function *F = cast<CallInst>(I)->getCalledFunction(); 2694 Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); 2695 if (o == NumOperands-1) { 2696 BasicBlock &BB = *I->getParent(); 2697 2698 Module *M = BB.getParent()->getParent(); 2699 Type *ArgTypeI = I->getType(); 2700 Type *ArgTypeJ = J->getType(); 2701 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2702 2703 ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType); 2704 continue; 2705 } else if (IID == Intrinsic::powi && o == 1) { 2706 // The second argument of powi is a single integer and we've already 2707 // checked that both arguments are equal. As a result, we just keep 2708 // I's second argument. 2709 ReplacedOperands[o] = I->getOperand(o); 2710 continue; 2711 } 2712 } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) { 2713 ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); 2714 continue; 2715 } 2716 2717 ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ); 2718 } 2719 } 2720 2721 // This function creates two values that represent the outputs of the 2722 // original I and J instructions. These are generally vector shuffles 2723 // or extracts. In many cases, these will end up being unused and, thus, 2724 // eliminated by later passes. 2725 void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 2726 Instruction *J, Instruction *K, 2727 Instruction *&InsertionPt, 2728 Instruction *&K1, Instruction *&K2) { 2729 if (isa<StoreInst>(I)) { 2730 AA->replaceWithNewValue(I, K); 2731 AA->replaceWithNewValue(J, K); 2732 } else { 2733 Type *IType = I->getType(); 2734 Type *JType = J->getType(); 2735 2736 VectorType *VType = getVecTypeForPair(IType, JType); 2737 unsigned numElem = VType->getNumElements(); 2738 2739 unsigned numElemI, numElemJ; 2740 if (IType->isVectorTy()) 2741 numElemI = cast<VectorType>(IType)->getNumElements(); 2742 else 2743 numElemI = 1; 2744 2745 if (JType->isVectorTy()) 2746 numElemJ = cast<VectorType>(JType)->getNumElements(); 2747 else 2748 numElemJ = 1; 2749 2750 if (IType->isVectorTy()) { 2751 std::vector<Constant*> Mask1(numElemI), Mask2(numElemI); 2752 for (unsigned v = 0; v < numElemI; ++v) { 2753 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2754 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v); 2755 } 2756 2757 K1 = new ShuffleVectorInst(K, UndefValue::get(VType), 2758 ConstantVector::get( Mask1), 2759 getReplacementName(K, false, 1)); 2760 } else { 2761 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 2762 K1 = ExtractElementInst::Create(K, CV0, 2763 getReplacementName(K, false, 1)); 2764 } 2765 2766 if (JType->isVectorTy()) { 2767 std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ); 2768 for (unsigned v = 0; v < numElemJ; ++v) { 2769 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2770 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v); 2771 } 2772 2773 K2 = new ShuffleVectorInst(K, UndefValue::get(VType), 2774 ConstantVector::get( Mask2), 2775 getReplacementName(K, false, 2)); 2776 } else { 2777 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); 2778 K2 = ExtractElementInst::Create(K, CV1, 2779 getReplacementName(K, false, 2)); 2780 } 2781 2782 K1->insertAfter(K); 2783 K2->insertAfter(K1); 2784 InsertionPt = K2; 2785 } 2786 } 2787 2788 // Move all uses of the function I (including pairing-induced uses) after J. 2789 bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, 2790 DenseSet<ValuePair> &LoadMoveSetPairs, 2791 Instruction *I, Instruction *J) { 2792 // Skip to the first instruction past I. 2793 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 2794 2795 DenseSet<Value *> Users; 2796 AliasSetTracker WriteSet(*AA); 2797 for (; cast<Instruction>(L) != J; ++L) 2798 (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs); 2799 2800 assert(cast<Instruction>(L) == J && 2801 "Tracking has not proceeded far enough to check for dependencies"); 2802 // If J is now in the use set of I, then trackUsesOfI will return true 2803 // and we have a dependency cycle (and the fusing operation must abort). 2804 return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs); 2805 } 2806 2807 // Move all uses of the function I (including pairing-induced uses) after J. 2808 void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, 2809 DenseSet<ValuePair> &LoadMoveSetPairs, 2810 Instruction *&InsertionPt, 2811 Instruction *I, Instruction *J) { 2812 // Skip to the first instruction past I. 2813 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 2814 2815 DenseSet<Value *> Users; 2816 AliasSetTracker WriteSet(*AA); 2817 for (; cast<Instruction>(L) != J;) { 2818 if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) { 2819 // Move this instruction 2820 Instruction *InstToMove = L; ++L; 2821 2822 DEBUG(dbgs() << "BBV: moving: " << *InstToMove << 2823 " to after " << *InsertionPt << "\n"); 2824 InstToMove->removeFromParent(); 2825 InstToMove->insertAfter(InsertionPt); 2826 InsertionPt = InstToMove; 2827 } else { 2828 ++L; 2829 } 2830 } 2831 } 2832 2833 // Collect all load instruction that are in the move set of a given first 2834 // pair member. These loads depend on the first instruction, I, and so need 2835 // to be moved after J (the second instruction) when the pair is fused. 2836 void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, 2837 DenseMap<Value *, Value *> &ChosenPairs, 2838 DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 2839 DenseSet<ValuePair> &LoadMoveSetPairs, 2840 Instruction *I) { 2841 // Skip to the first instruction past I. 2842 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 2843 2844 DenseSet<Value *> Users; 2845 AliasSetTracker WriteSet(*AA); 2846 2847 // Note: We cannot end the loop when we reach J because J could be moved 2848 // farther down the use chain by another instruction pairing. Also, J 2849 // could be before I if this is an inverted input. 2850 for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) { 2851 if (trackUsesOfI(Users, WriteSet, I, L)) { 2852 if (L->mayReadFromMemory()) { 2853 LoadMoveSet[L].push_back(I); 2854 LoadMoveSetPairs.insert(ValuePair(L, I)); 2855 } 2856 } 2857 } 2858 } 2859 2860 // In cases where both load/stores and the computation of their pointers 2861 // are chosen for vectorization, we can end up in a situation where the 2862 // aliasing analysis starts returning different query results as the 2863 // process of fusing instruction pairs continues. Because the algorithm 2864 // relies on finding the same use trees here as were found earlier, we'll 2865 // need to precompute the necessary aliasing information here and then 2866 // manually update it during the fusion process. 2867 void BBVectorize::collectLoadMoveSet(BasicBlock &BB, 2868 std::vector<Value *> &PairableInsts, 2869 DenseMap<Value *, Value *> &ChosenPairs, 2870 DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 2871 DenseSet<ValuePair> &LoadMoveSetPairs) { 2872 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 2873 PIE = PairableInsts.end(); PI != PIE; ++PI) { 2874 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); 2875 if (P == ChosenPairs.end()) continue; 2876 2877 Instruction *I = cast<Instruction>(P->first); 2878 collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, 2879 LoadMoveSetPairs, I); 2880 } 2881 } 2882 2883 // When the first instruction in each pair is cloned, it will inherit its 2884 // parent's metadata. This metadata must be combined with that of the other 2885 // instruction in a safe way. 2886 void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) { 2887 SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata; 2888 K->getAllMetadataOtherThanDebugLoc(Metadata); 2889 for (unsigned i = 0, n = Metadata.size(); i < n; ++i) { 2890 unsigned Kind = Metadata[i].first; 2891 MDNode *JMD = J->getMetadata(Kind); 2892 MDNode *KMD = Metadata[i].second; 2893 2894 switch (Kind) { 2895 default: 2896 K->setMetadata(Kind, 0); // Remove unknown metadata 2897 break; 2898 case LLVMContext::MD_tbaa: 2899 K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD)); 2900 break; 2901 case LLVMContext::MD_fpmath: 2902 K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD)); 2903 break; 2904 } 2905 } 2906 } 2907 2908 // This function fuses the chosen instruction pairs into vector instructions, 2909 // taking care preserve any needed scalar outputs and, then, it reorders the 2910 // remaining instructions as needed (users of the first member of the pair 2911 // need to be moved to after the location of the second member of the pair 2912 // because the vector instruction is inserted in the location of the pair's 2913 // second member). 2914 void BBVectorize::fuseChosenPairs(BasicBlock &BB, 2915 std::vector<Value *> &PairableInsts, 2916 DenseMap<Value *, Value *> &ChosenPairs, 2917 DenseSet<ValuePair> &FixedOrderPairs, 2918 DenseMap<VPPair, unsigned> &PairConnectionTypes, 2919 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 2920 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) { 2921 LLVMContext& Context = BB.getContext(); 2922 2923 // During the vectorization process, the order of the pairs to be fused 2924 // could be flipped. So we'll add each pair, flipped, into the ChosenPairs 2925 // list. After a pair is fused, the flipped pair is removed from the list. 2926 DenseSet<ValuePair> FlippedPairs; 2927 for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), 2928 E = ChosenPairs.end(); P != E; ++P) 2929 FlippedPairs.insert(ValuePair(P->second, P->first)); 2930 for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(), 2931 E = FlippedPairs.end(); P != E; ++P) 2932 ChosenPairs.insert(*P); 2933 2934 DenseMap<Value *, std::vector<Value *> > LoadMoveSet; 2935 DenseSet<ValuePair> LoadMoveSetPairs; 2936 collectLoadMoveSet(BB, PairableInsts, ChosenPairs, 2937 LoadMoveSet, LoadMoveSetPairs); 2938 2939 DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); 2940 2941 for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { 2942 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI); 2943 if (P == ChosenPairs.end()) { 2944 ++PI; 2945 continue; 2946 } 2947 2948 if (getDepthFactor(P->first) == 0) { 2949 // These instructions are not really fused, but are tracked as though 2950 // they are. Any case in which it would be interesting to fuse them 2951 // will be taken care of by InstCombine. 2952 --NumFusedOps; 2953 ++PI; 2954 continue; 2955 } 2956 2957 Instruction *I = cast<Instruction>(P->first), 2958 *J = cast<Instruction>(P->second); 2959 2960 DEBUG(dbgs() << "BBV: fusing: " << *I << 2961 " <-> " << *J << "\n"); 2962 2963 // Remove the pair and flipped pair from the list. 2964 DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second); 2965 assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); 2966 ChosenPairs.erase(FP); 2967 ChosenPairs.erase(P); 2968 2969 if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) { 2970 DEBUG(dbgs() << "BBV: fusion of: " << *I << 2971 " <-> " << *J << 2972 " aborted because of non-trivial dependency cycle\n"); 2973 --NumFusedOps; 2974 ++PI; 2975 continue; 2976 } 2977 2978 // If the pair must have the other order, then flip it. 2979 bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I)); 2980 if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) { 2981 // This pair does not have a fixed order, and so we might want to 2982 // flip it if that will yield fewer shuffles. We count the number 2983 // of dependencies connected via swaps, and those directly connected, 2984 // and flip the order if the number of swaps is greater. 2985 bool OrigOrder = true; 2986 DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ = 2987 ConnectedPairDeps.find(ValuePair(I, J)); 2988 if (IJ == ConnectedPairDeps.end()) { 2989 IJ = ConnectedPairDeps.find(ValuePair(J, I)); 2990 OrigOrder = false; 2991 } 2992 2993 if (IJ != ConnectedPairDeps.end()) { 2994 unsigned NumDepsDirect = 0, NumDepsSwap = 0; 2995 for (std::vector<ValuePair>::iterator T = IJ->second.begin(), 2996 TE = IJ->second.end(); T != TE; ++T) { 2997 VPPair Q(IJ->first, *T); 2998 DenseMap<VPPair, unsigned>::iterator R = 2999 PairConnectionTypes.find(VPPair(Q.second, Q.first)); 3000 assert(R != PairConnectionTypes.end() && 3001 "Cannot find pair connection type"); 3002 if (R->second == PairConnectionDirect) 3003 ++NumDepsDirect; 3004 else if (R->second == PairConnectionSwap) 3005 ++NumDepsSwap; 3006 } 3007 3008 if (!OrigOrder) 3009 std::swap(NumDepsDirect, NumDepsSwap); 3010 3011 if (NumDepsSwap > NumDepsDirect) { 3012 FlipPairOrder = true; 3013 DEBUG(dbgs() << "BBV: reordering pair: " << *I << 3014 " <-> " << *J << "\n"); 3015 } 3016 } 3017 } 3018 3019 Instruction *L = I, *H = J; 3020 if (FlipPairOrder) 3021 std::swap(H, L); 3022 3023 // If the pair being fused uses the opposite order from that in the pair 3024 // connection map, then we need to flip the types. 3025 DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL = 3026 ConnectedPairs.find(ValuePair(H, L)); 3027 if (HL != ConnectedPairs.end()) 3028 for (std::vector<ValuePair>::iterator T = HL->second.begin(), 3029 TE = HL->second.end(); T != TE; ++T) { 3030 VPPair Q(HL->first, *T); 3031 DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q); 3032 assert(R != PairConnectionTypes.end() && 3033 "Cannot find pair connection type"); 3034 if (R->second == PairConnectionDirect) 3035 R->second = PairConnectionSwap; 3036 else if (R->second == PairConnectionSwap) 3037 R->second = PairConnectionDirect; 3038 } 3039 3040 bool LBeforeH = !FlipPairOrder; 3041 unsigned NumOperands = I->getNumOperands(); 3042 SmallVector<Value *, 3> ReplacedOperands(NumOperands); 3043 getReplacementInputsForPair(Context, L, H, ReplacedOperands, 3044 LBeforeH); 3045 3046 // Make a copy of the original operation, change its type to the vector 3047 // type and replace its operands with the vector operands. 3048 Instruction *K = L->clone(); 3049 if (L->hasName()) 3050 K->takeName(L); 3051 else if (H->hasName()) 3052 K->takeName(H); 3053 3054 if (!isa<StoreInst>(K)) 3055 K->mutateType(getVecTypeForPair(L->getType(), H->getType())); 3056 3057 combineMetadata(K, H); 3058 K->intersectOptionalDataWith(H); 3059 3060 for (unsigned o = 0; o < NumOperands; ++o) 3061 K->setOperand(o, ReplacedOperands[o]); 3062 3063 K->insertAfter(J); 3064 3065 // Instruction insertion point: 3066 Instruction *InsertionPt = K; 3067 Instruction *K1 = 0, *K2 = 0; 3068 replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); 3069 3070 // The use tree of the first original instruction must be moved to after 3071 // the location of the second instruction. The entire use tree of the 3072 // first instruction is disjoint from the input tree of the second 3073 // (by definition), and so commutes with it. 3074 3075 moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J); 3076 3077 if (!isa<StoreInst>(I)) { 3078 L->replaceAllUsesWith(K1); 3079 H->replaceAllUsesWith(K2); 3080 AA->replaceWithNewValue(L, K1); 3081 AA->replaceWithNewValue(H, K2); 3082 } 3083 3084 // Instructions that may read from memory may be in the load move set. 3085 // Once an instruction is fused, we no longer need its move set, and so 3086 // the values of the map never need to be updated. However, when a load 3087 // is fused, we need to merge the entries from both instructions in the 3088 // pair in case those instructions were in the move set of some other 3089 // yet-to-be-fused pair. The loads in question are the keys of the map. 3090 if (I->mayReadFromMemory()) { 3091 std::vector<ValuePair> NewSetMembers; 3092 DenseMap<Value *, std::vector<Value *> >::iterator II = 3093 LoadMoveSet.find(I); 3094 if (II != LoadMoveSet.end()) 3095 for (std::vector<Value *>::iterator N = II->second.begin(), 3096 NE = II->second.end(); N != NE; ++N) 3097 NewSetMembers.push_back(ValuePair(K, *N)); 3098 DenseMap<Value *, std::vector<Value *> >::iterator JJ = 3099 LoadMoveSet.find(J); 3100 if (JJ != LoadMoveSet.end()) 3101 for (std::vector<Value *>::iterator N = JJ->second.begin(), 3102 NE = JJ->second.end(); N != NE; ++N) 3103 NewSetMembers.push_back(ValuePair(K, *N)); 3104 for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), 3105 AE = NewSetMembers.end(); A != AE; ++A) { 3106 LoadMoveSet[A->first].push_back(A->second); 3107 LoadMoveSetPairs.insert(*A); 3108 } 3109 } 3110 3111 // Before removing I, set the iterator to the next instruction. 3112 PI = llvm::next(BasicBlock::iterator(I)); 3113 if (cast<Instruction>(PI) == J) 3114 ++PI; 3115 3116 SE->forgetValue(I); 3117 SE->forgetValue(J); 3118 I->eraseFromParent(); 3119 J->eraseFromParent(); 3120 3121 DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" << 3122 BB << "\n"); 3123 } 3124 3125 DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); 3126 } 3127} 3128 3129char BBVectorize::ID = 0; 3130static const char bb_vectorize_name[] = "Basic-Block Vectorization"; 3131INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 3132INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 3133INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) 3134INITIALIZE_PASS_DEPENDENCY(DominatorTree) 3135INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 3136INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 3137 3138BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { 3139 return new BBVectorize(C); 3140} 3141 3142bool 3143llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { 3144 BBVectorize BBVectorizer(P, C); 3145 return BBVectorizer.vectorizeBB(BB); 3146} 3147 3148//===----------------------------------------------------------------------===// 3149VectorizeConfig::VectorizeConfig() { 3150 VectorBits = ::VectorBits; 3151 VectorizeBools = !::NoBools; 3152 VectorizeInts = !::NoInts; 3153 VectorizeFloats = !::NoFloats; 3154 VectorizePointers = !::NoPointers; 3155 VectorizeCasts = !::NoCasts; 3156 VectorizeMath = !::NoMath; 3157 VectorizeFMA = !::NoFMA; 3158 VectorizeSelect = !::NoSelect; 3159 VectorizeCmp = !::NoCmp; 3160 VectorizeGEP = !::NoGEP; 3161 VectorizeMemOps = !::NoMemOps; 3162 AlignedOnly = ::AlignedOnly; 3163 ReqChainDepth= ::ReqChainDepth; 3164 SearchLimit = ::SearchLimit; 3165 MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; 3166 SplatBreaksChain = ::SplatBreaksChain; 3167 MaxInsts = ::MaxInsts; 3168 MaxIter = ::MaxIter; 3169 Pow2LenOnly = ::Pow2LenOnly; 3170 NoMemOpBoost = ::NoMemOpBoost; 3171 FastDep = ::FastDep; 3172} 3173