SROA.cpp revision 36b56886974eae4f9c5ebc96befd3e7bfe5de338
1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9/// \file 10/// This transformation implements the well known scalar replacement of 11/// aggregates transformation. It tries to identify promotable elements of an 12/// aggregate alloca, and promote them to registers. It will also try to 13/// convert uses of an element (or set of elements) of an alloca into a vector 14/// or bitfield-style integer scalar if appropriate. 15/// 16/// It works to do this with minimal slicing of the alloca so that regions 17/// which are merely transferred in and out of external memory remain unchanged 18/// and are not decomposed to scalar code. 19/// 20/// Because this also performs alloca promotion, it can be thought of as also 21/// serving the purpose of SSA formation. The algorithm iterates on the 22/// function until all opportunities for promotion have been realized. 23/// 24//===----------------------------------------------------------------------===// 25 26#define DEBUG_TYPE "sroa" 27#include "llvm/Transforms/Scalar.h" 28#include "llvm/ADT/STLExtras.h" 29#include "llvm/ADT/SetVector.h" 30#include "llvm/ADT/SmallVector.h" 31#include "llvm/ADT/Statistic.h" 32#include "llvm/Analysis/Loads.h" 33#include "llvm/Analysis/PtrUseVisitor.h" 34#include "llvm/Analysis/ValueTracking.h" 35#include "llvm/IR/Constants.h" 36#include "llvm/IR/DIBuilder.h" 37#include "llvm/IR/DataLayout.h" 38#include "llvm/IR/DebugInfo.h" 39#include "llvm/IR/DerivedTypes.h" 40#include "llvm/IR/Dominators.h" 41#include "llvm/IR/Function.h" 42#include "llvm/IR/IRBuilder.h" 43#include "llvm/IR/InstVisitor.h" 44#include "llvm/IR/Instructions.h" 45#include "llvm/IR/IntrinsicInst.h" 46#include "llvm/IR/LLVMContext.h" 47#include "llvm/IR/Operator.h" 48#include "llvm/Pass.h" 49#include "llvm/Support/CommandLine.h" 50#include "llvm/Support/Compiler.h" 51#include "llvm/Support/Debug.h" 52#include "llvm/Support/ErrorHandling.h" 53#include "llvm/Support/MathExtras.h" 54#include "llvm/Support/TimeValue.h" 55#include "llvm/Support/raw_ostream.h" 56#include "llvm/Transforms/Utils/Local.h" 57#include "llvm/Transforms/Utils/PromoteMemToReg.h" 58#include "llvm/Transforms/Utils/SSAUpdater.h" 59 60#if __cplusplus >= 201103L && !defined(NDEBUG) 61// We only use this for a debug check in C++11 62#include <random> 63#endif 64 65using namespace llvm; 66 67STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); 68STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed"); 69STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca"); 70STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten"); 71STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition"); 72STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); 73STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); 74STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); 75STATISTIC(NumDeleted, "Number of instructions deleted"); 76STATISTIC(NumVectorized, "Number of vectorized aggregates"); 77 78/// Hidden option to force the pass to not use DomTree and mem2reg, instead 79/// forming SSA values through the SSAUpdater infrastructure. 
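/// For example (assuming the standard opt driver), this can be exercised as
/// "opt -sroa -force-ssa-updater" to compare the SSAUpdater-based promotion
/// path against the default DomTree/mem2reg-based one.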
80static cl::opt<bool> 81ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); 82 83/// Hidden option to enable randomly shuffling the slices to help uncover 84/// instability in their order. 85static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", 86 cl::init(false), cl::Hidden); 87 88/// Hidden option to experiment with completely strict handling of inbounds 89/// GEPs. 90static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", 91 cl::init(false), cl::Hidden); 92 93namespace { 94/// \brief A custom IRBuilder inserter which prefixes all names if they are 95/// preserved. 96template <bool preserveNames = true> 97class IRBuilderPrefixedInserter : 98 public IRBuilderDefaultInserter<preserveNames> { 99 std::string Prefix; 100 101public: 102 void SetNamePrefix(const Twine &P) { Prefix = P.str(); } 103 104protected: 105 void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB, 106 BasicBlock::iterator InsertPt) const { 107 IRBuilderDefaultInserter<preserveNames>::InsertHelper( 108 I, Name.isTriviallyEmpty() ? Name : Prefix + Name, BB, InsertPt); 109 } 110}; 111 112// Specialization for not preserving the name is trivial. 113template <> 114class IRBuilderPrefixedInserter<false> : 115 public IRBuilderDefaultInserter<false> { 116public: 117 void SetNamePrefix(const Twine &P) {} 118}; 119 120/// \brief Provide a typedef for IRBuilder that drops names in release builds. 121#ifndef NDEBUG 122typedef llvm::IRBuilder<true, ConstantFolder, 123 IRBuilderPrefixedInserter<true> > IRBuilderTy; 124#else 125typedef llvm::IRBuilder<false, ConstantFolder, 126 IRBuilderPrefixedInserter<false> > IRBuilderTy; 127#endif 128} 129 130namespace { 131/// \brief A used slice of an alloca. 132/// 133/// This structure represents a slice of an alloca used by some instruction. It 134/// stores both the begin and end offsets of this use, a pointer to the use 135/// itself, and a flag indicating whether we can classify the use as splittable 136/// or not when forming partitions of the alloca. 137class Slice { 138 /// \brief The beginning offset of the range. 139 uint64_t BeginOffset; 140 141 /// \brief The ending offset, not included in the range. 142 uint64_t EndOffset; 143 144 /// \brief Storage for both the use of this slice and whether it can be 145 /// split. 146 PointerIntPair<Use *, 1, bool> UseAndIsSplittable; 147 148public: 149 Slice() : BeginOffset(), EndOffset() {} 150 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable) 151 : BeginOffset(BeginOffset), EndOffset(EndOffset), 152 UseAndIsSplittable(U, IsSplittable) {} 153 154 uint64_t beginOffset() const { return BeginOffset; } 155 uint64_t endOffset() const { return EndOffset; } 156 157 bool isSplittable() const { return UseAndIsSplittable.getInt(); } 158 void makeUnsplittable() { UseAndIsSplittable.setInt(false); } 159 160 Use *getUse() const { return UseAndIsSplittable.getPointer(); } 161 162 bool isDead() const { return getUse() == 0; } 163 void kill() { UseAndIsSplittable.setPointer(0); } 164 165 /// \brief Support for ordering ranges. 166 /// 167 /// This provides an ordering over ranges such that start offsets are 168 /// always increasing, and within equal start offsets, the end offsets are 169 /// decreasing. Thus the spanning range comes first in a cluster with the 170 /// same start position. 
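  /// For example, the slices [0,16) unsplittable, [0,16) splittable,
  /// [0,8) splittable, and [4,12) splittable sort in exactly that order:
  /// begin offsets increase, unsplittable slices come before splittable ones
  /// with the same begin offset, and the spanning (larger) range comes first
  /// among otherwise equal slices.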
171 bool operator<(const Slice &RHS) const { 172 if (beginOffset() < RHS.beginOffset()) return true; 173 if (beginOffset() > RHS.beginOffset()) return false; 174 if (isSplittable() != RHS.isSplittable()) return !isSplittable(); 175 if (endOffset() > RHS.endOffset()) return true; 176 return false; 177 } 178 179 /// \brief Support comparison with a single offset to allow binary searches. 180 friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS, 181 uint64_t RHSOffset) { 182 return LHS.beginOffset() < RHSOffset; 183 } 184 friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, 185 const Slice &RHS) { 186 return LHSOffset < RHS.beginOffset(); 187 } 188 189 bool operator==(const Slice &RHS) const { 190 return isSplittable() == RHS.isSplittable() && 191 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset(); 192 } 193 bool operator!=(const Slice &RHS) const { return !operator==(RHS); } 194}; 195} // end anonymous namespace 196 197namespace llvm { 198template <typename T> struct isPodLike; 199template <> struct isPodLike<Slice> { 200 static const bool value = true; 201}; 202} 203 204namespace { 205/// \brief Representation of the alloca slices. 206/// 207/// This class represents the slices of an alloca which are formed by its 208/// various uses. If a pointer escapes, we can't fully build a representation 209/// for the slices used and we reflect that in this structure. The uses are 210/// stored, sorted by increasing beginning offset and with unsplittable slices 211/// starting at a particular offset before splittable slices. 212class AllocaSlices { 213public: 214 /// \brief Construct the slices of a particular alloca. 215 AllocaSlices(const DataLayout &DL, AllocaInst &AI); 216 217 /// \brief Test whether a pointer to the allocation escapes our analysis. 218 /// 219 /// If this is true, the slices are never fully built and should be 220 /// ignored. 221 bool isEscaped() const { return PointerEscapingInstr; } 222 223 /// \brief Support for iterating over the slices. 224 /// @{ 225 typedef SmallVectorImpl<Slice>::iterator iterator; 226 iterator begin() { return Slices.begin(); } 227 iterator end() { return Slices.end(); } 228 229 typedef SmallVectorImpl<Slice>::const_iterator const_iterator; 230 const_iterator begin() const { return Slices.begin(); } 231 const_iterator end() const { return Slices.end(); } 232 /// @} 233 234 /// \brief Allow iterating the dead users for this alloca. 235 /// 236 /// These are instructions which will never actually use the alloca as they 237 /// are outside the allocated range. They are safe to replace with undef and 238 /// delete. 239 /// @{ 240 typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator; 241 dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); } 242 dead_user_iterator dead_user_end() const { return DeadUsers.end(); } 243 /// @} 244 245 /// \brief Allow iterating the dead expressions referring to this alloca. 246 /// 247 /// These are operands which have cannot actually be used to refer to the 248 /// alloca as they are outside its range and the user doesn't correct for 249 /// that. These mostly consist of PHI node inputs and the like which we just 250 /// need to replace with undef. 
251 /// @{ 252 typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator; 253 dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); } 254 dead_op_iterator dead_op_end() const { return DeadOperands.end(); } 255 /// @} 256 257#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 258 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; 259 void printSlice(raw_ostream &OS, const_iterator I, 260 StringRef Indent = " ") const; 261 void printUse(raw_ostream &OS, const_iterator I, 262 StringRef Indent = " ") const; 263 void print(raw_ostream &OS) const; 264 void dump(const_iterator I) const; 265 void dump() const; 266#endif 267 268private: 269 template <typename DerivedT, typename RetT = void> class BuilderBase; 270 class SliceBuilder; 271 friend class AllocaSlices::SliceBuilder; 272 273#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 274 /// \brief Handle to alloca instruction to simplify method interfaces. 275 AllocaInst &AI; 276#endif 277 278 /// \brief The instruction responsible for this alloca not having a known set 279 /// of slices. 280 /// 281 /// When an instruction (potentially) escapes the pointer to the alloca, we 282 /// store a pointer to that here and abort trying to form slices of the 283 /// alloca. This will be null if the alloca slices are analyzed successfully. 284 Instruction *PointerEscapingInstr; 285 286 /// \brief The slices of the alloca. 287 /// 288 /// We store a vector of the slices formed by uses of the alloca here. This 289 /// vector is sorted by increasing begin offset, and then the unsplittable 290 /// slices before the splittable ones. See the Slice inner class for more 291 /// details. 292 SmallVector<Slice, 8> Slices; 293 294 /// \brief Instructions which will become dead if we rewrite the alloca. 295 /// 296 /// Note that these are not separated by slice. This is because we expect an 297 /// alloca to be completely rewritten or not rewritten at all. If rewritten, 298 /// all these instructions can simply be removed and replaced with undef as 299 /// they come from outside of the allocated space. 300 SmallVector<Instruction *, 8> DeadUsers; 301 302 /// \brief Operands which will become dead if we rewrite the alloca. 303 /// 304 /// These are operands that in their particular use can be replaced with 305 /// undef when we rewrite the alloca. These show up in out-of-bounds inputs 306 /// to PHI nodes and the like. They aren't entirely dead (there might be 307 /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we 308 /// want to swap this particular input for undef to simplify the use lists of 309 /// the alloca. 310 SmallVector<Use *, 8> DeadOperands; 311}; 312} 313 314static Value *foldSelectInst(SelectInst &SI) { 315 // If the condition being selected on is a constant or the same value is 316 // being selected between, fold the select. Yes this does (rarely) happen 317 // early on. 318 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) 319 return SI.getOperand(1+CI->isZero()); 320 if (SI.getOperand(1) == SI.getOperand(2)) 321 return SI.getOperand(1); 322 323 return 0; 324} 325 326/// \brief Builder for the alloca slices. 327/// 328/// This class builds a set of alloca slices by recursively visiting the uses 329/// of an alloca and making a slice for each load and store at each offset. 
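/// As a small illustration (the IR names here are arbitrary), for
///
///   %a = alloca { i32, i32 }
///   %f0 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
///   %v = load i32* %f0
///   %cast = bitcast { i32, i32 }* %a to i8*
///   call void @llvm.memset.p0i8.i64(i8* %cast, i8 0, i64 8, i32 4, i1 false)
///
/// the builder records an unsplittable [0,4) slice for the load and a
/// splittable [0,8) slice for the constant-length memset.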
330class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { 331 friend class PtrUseVisitor<SliceBuilder>; 332 friend class InstVisitor<SliceBuilder>; 333 typedef PtrUseVisitor<SliceBuilder> Base; 334 335 const uint64_t AllocSize; 336 AllocaSlices &S; 337 338 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap; 339 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes; 340 341 /// \brief Set to de-duplicate dead instructions found in the use walk. 342 SmallPtrSet<Instruction *, 4> VisitedDeadInsts; 343 344public: 345 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &S) 346 : PtrUseVisitor<SliceBuilder>(DL), 347 AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), S(S) {} 348 349private: 350 void markAsDead(Instruction &I) { 351 if (VisitedDeadInsts.insert(&I)) 352 S.DeadUsers.push_back(&I); 353 } 354 355 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size, 356 bool IsSplittable = false) { 357 // Completely skip uses which have a zero size or start either before or 358 // past the end of the allocation. 359 if (Size == 0 || Offset.uge(AllocSize)) { 360 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset 361 << " which has zero size or starts outside of the " 362 << AllocSize << " byte alloca:\n" 363 << " alloca: " << S.AI << "\n" 364 << " use: " << I << "\n"); 365 return markAsDead(I); 366 } 367 368 uint64_t BeginOffset = Offset.getZExtValue(); 369 uint64_t EndOffset = BeginOffset + Size; 370 371 // Clamp the end offset to the end of the allocation. Note that this is 372 // formulated to handle even the case where "BeginOffset + Size" overflows. 373 // This may appear superficially to be something we could ignore entirely, 374 // but that is not so! There may be widened loads or PHI-node uses where 375 // some instructions are dead but not others. We can't completely ignore 376 // them, and so have to record at least the information here. 377 assert(AllocSize >= BeginOffset); // Established above. 378 if (Size > AllocSize - BeginOffset) { 379 DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset 380 << " to remain within the " << AllocSize << " byte alloca:\n" 381 << " alloca: " << S.AI << "\n" 382 << " use: " << I << "\n"); 383 EndOffset = AllocSize; 384 } 385 386 S.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable)); 387 } 388 389 void visitBitCastInst(BitCastInst &BC) { 390 if (BC.use_empty()) 391 return markAsDead(BC); 392 393 return Base::visitBitCastInst(BC); 394 } 395 396 void visitGetElementPtrInst(GetElementPtrInst &GEPI) { 397 if (GEPI.use_empty()) 398 return markAsDead(GEPI); 399 400 if (SROAStrictInbounds && GEPI.isInBounds()) { 401 // FIXME: This is a manually un-factored variant of the basic code inside 402 // of GEPs with checking of the inbounds invariant specified in the 403 // langref in a very strict sense. If we ever want to enable 404 // SROAStrictInbounds, this code should be factored cleanly into 405 // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds 406 // by writing out the code here where we have tho underlying allocation 407 // size readily available. 408 APInt GEPOffset = Offset; 409 for (gep_type_iterator GTI = gep_type_begin(GEPI), 410 GTE = gep_type_end(GEPI); 411 GTI != GTE; ++GTI) { 412 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); 413 if (!OpC) 414 break; 415 416 // Handle a struct index, which adds its field offset to the pointer. 
        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
          unsigned ElementIdx = OpC->getZExtValue();
          const StructLayout *SL = DL.getStructLayout(STy);
          GEPOffset +=
              APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
        } else {
          // For array or vector indices, scale the index by the size of the
          // type.
          APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
          GEPOffset += Index * APInt(Offset.getBitWidth(),
                                     DL.getTypeAllocSize(GTI.getIndexedType()));
        }

        // If this index has computed an intermediate pointer which is not
        // inbounds, then the result of the GEP is a poison value and we can
        // delete it and all uses.
        if (GEPOffset.ugt(AllocSize))
          return markAsDead(GEPI);
      }
    }

    return Base::visitGetElementPtrInst(GEPI);
  }

  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                         uint64_t Size, bool IsVolatile) {
    // We allow splitting of loads and stores where the type is an integer
    // type and covers the entire alloca. This prevents us from splitting
    // over-eagerly.
    // FIXME: In the great blue eventually, we should eagerly split all integer
    // loads and stores, and then have a separate step that merges adjacent
    // alloca partitions into a single partition suitable for integer widening.
    // Or we should skip the merge step and rely on GVN and other passes to
    // merge adjacent loads and stores that survive mem2reg.
    bool IsSplittable =
        Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize;

    insertUse(I, Offset, Size, IsSplittable);
  }

  void visitLoadInst(LoadInst &LI) {
    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
           "All simple FCA loads should have been pre-split");

    if (!IsOffsetKnown)
      return PI.setAborted(&LI);

    uint64_t Size = DL.getTypeStoreSize(LI.getType());
    return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
  }

  void visitStoreInst(StoreInst &SI) {
    Value *ValOp = SI.getValueOperand();
    if (ValOp == *U)
      return PI.setEscapedAndAborted(&SI);
    if (!IsOffsetKnown)
      return PI.setAborted(&SI);

    uint64_t Size = DL.getTypeStoreSize(ValOp->getType());

    // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply ignore
    // it. Note that this is more strict than the generic clamping behavior of
    // insertUse. We also try to handle cases which might run the risk of
    // overflow.
    // FIXME: We should instead consider the pointer to have escaped if this
    // function is being instrumented for addressing bugs or race conditions.
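    // For example, an i64 store at byte offset 4 of an 8-byte alloca is
    // dropped here: Size (8) exceeds AllocSize - Offset (4), so the store
    // could never execute without undefined behavior and is treated as dead.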
483 if (Size > AllocSize || Offset.ugt(AllocSize - Size)) { 484 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset 485 << " which extends past the end of the " << AllocSize 486 << " byte alloca:\n" 487 << " alloca: " << S.AI << "\n" 488 << " use: " << SI << "\n"); 489 return markAsDead(SI); 490 } 491 492 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && 493 "All simple FCA stores should have been pre-split"); 494 handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile()); 495 } 496 497 498 void visitMemSetInst(MemSetInst &II) { 499 assert(II.getRawDest() == *U && "Pointer use is not the destination?"); 500 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 501 if ((Length && Length->getValue() == 0) || 502 (IsOffsetKnown && Offset.uge(AllocSize))) 503 // Zero-length mem transfer intrinsics can be ignored entirely. 504 return markAsDead(II); 505 506 if (!IsOffsetKnown) 507 return PI.setAborted(&II); 508 509 insertUse(II, Offset, 510 Length ? Length->getLimitedValue() 511 : AllocSize - Offset.getLimitedValue(), 512 (bool)Length); 513 } 514 515 void visitMemTransferInst(MemTransferInst &II) { 516 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 517 if (Length && Length->getValue() == 0) 518 // Zero-length mem transfer intrinsics can be ignored entirely. 519 return markAsDead(II); 520 521 // Because we can visit these intrinsics twice, also check to see if the 522 // first time marked this instruction as dead. If so, skip it. 523 if (VisitedDeadInsts.count(&II)) 524 return; 525 526 if (!IsOffsetKnown) 527 return PI.setAborted(&II); 528 529 // This side of the transfer is completely out-of-bounds, and so we can 530 // nuke the entire transfer. However, we also need to nuke the other side 531 // if already added to our partitions. 532 // FIXME: Yet another place we really should bypass this when 533 // instrumenting for ASan. 534 if (Offset.uge(AllocSize)) { 535 SmallDenseMap<Instruction *, unsigned>::iterator MTPI = MemTransferSliceMap.find(&II); 536 if (MTPI != MemTransferSliceMap.end()) 537 S.Slices[MTPI->second].kill(); 538 return markAsDead(II); 539 } 540 541 uint64_t RawOffset = Offset.getLimitedValue(); 542 uint64_t Size = Length ? Length->getLimitedValue() 543 : AllocSize - RawOffset; 544 545 // Check for the special case where the same exact value is used for both 546 // source and dest. 547 if (*U == II.getRawDest() && *U == II.getRawSource()) { 548 // For non-volatile transfers this is a no-op. 549 if (!II.isVolatile()) 550 return markAsDead(II); 551 552 return insertUse(II, Offset, Size, /*IsSplittable=*/false); 553 } 554 555 // If we have seen both source and destination for a mem transfer, then 556 // they both point to the same alloca. 557 bool Inserted; 558 SmallDenseMap<Instruction *, unsigned>::iterator MTPI; 559 std::tie(MTPI, Inserted) = 560 MemTransferSliceMap.insert(std::make_pair(&II, S.Slices.size())); 561 unsigned PrevIdx = MTPI->second; 562 if (!Inserted) { 563 Slice &PrevP = S.Slices[PrevIdx]; 564 565 // Check if the begin offsets match and this is a non-volatile transfer. 566 // In that case, we can completely elide the transfer. 567 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) { 568 PrevP.kill(); 569 return markAsDead(II); 570 } 571 572 // Otherwise we have an offset transfer within the same alloca. We can't 573 // split those. 574 PrevP.makeUnsplittable(); 575 } 576 577 // Insert the use now that we've fixed up the splittable nature. 
578 insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length); 579 580 // Check that we ended up with a valid index in the map. 581 assert(S.Slices[PrevIdx].getUse()->getUser() == &II && 582 "Map index doesn't point back to a slice with this user."); 583 } 584 585 // Disable SRoA for any intrinsics except for lifetime invariants. 586 // FIXME: What about debug intrinsics? This matches old behavior, but 587 // doesn't make sense. 588 void visitIntrinsicInst(IntrinsicInst &II) { 589 if (!IsOffsetKnown) 590 return PI.setAborted(&II); 591 592 if (II.getIntrinsicID() == Intrinsic::lifetime_start || 593 II.getIntrinsicID() == Intrinsic::lifetime_end) { 594 ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); 595 uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(), 596 Length->getLimitedValue()); 597 insertUse(II, Offset, Size, true); 598 return; 599 } 600 601 Base::visitIntrinsicInst(II); 602 } 603 604 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { 605 // We consider any PHI or select that results in a direct load or store of 606 // the same offset to be a viable use for slicing purposes. These uses 607 // are considered unsplittable and the size is the maximum loaded or stored 608 // size. 609 SmallPtrSet<Instruction *, 4> Visited; 610 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses; 611 Visited.insert(Root); 612 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root)); 613 // If there are no loads or stores, the access is dead. We mark that as 614 // a size zero access. 615 Size = 0; 616 do { 617 Instruction *I, *UsedI; 618 std::tie(UsedI, I) = Uses.pop_back_val(); 619 620 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 621 Size = std::max(Size, DL.getTypeStoreSize(LI->getType())); 622 continue; 623 } 624 if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 625 Value *Op = SI->getOperand(0); 626 if (Op == UsedI) 627 return SI; 628 Size = std::max(Size, DL.getTypeStoreSize(Op->getType())); 629 continue; 630 } 631 632 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { 633 if (!GEP->hasAllZeroIndices()) 634 return GEP; 635 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) && 636 !isa<SelectInst>(I)) { 637 return I; 638 } 639 640 for (User *U : I->users()) 641 if (Visited.insert(cast<Instruction>(U))) 642 Uses.push_back(std::make_pair(I, cast<Instruction>(U))); 643 } while (!Uses.empty()); 644 645 return 0; 646 } 647 648 void visitPHINode(PHINode &PN) { 649 if (PN.use_empty()) 650 return markAsDead(PN); 651 if (!IsOffsetKnown) 652 return PI.setAborted(&PN); 653 654 // See if we already have computed info on this node. 655 uint64_t &PHISize = PHIOrSelectSizes[&PN]; 656 if (!PHISize) { 657 // This is a new PHI node, check for an unsafe use of the PHI node. 658 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHISize)) 659 return PI.setAborted(UnsafeI); 660 } 661 662 // For PHI and select operands outside the alloca, we can't nuke the entire 663 // phi or select -- the other side might still be relevant, so we special 664 // case them here and use a separate structure to track the operands 665 // themselves which should be replaced with undef. 666 // FIXME: This should instead be escaped in the event we're instrumenting 667 // for address sanitization. 
668 if (Offset.uge(AllocSize)) { 669 S.DeadOperands.push_back(U); 670 return; 671 } 672 673 insertUse(PN, Offset, PHISize); 674 } 675 676 void visitSelectInst(SelectInst &SI) { 677 if (SI.use_empty()) 678 return markAsDead(SI); 679 if (Value *Result = foldSelectInst(SI)) { 680 if (Result == *U) 681 // If the result of the constant fold will be the pointer, recurse 682 // through the select as if we had RAUW'ed it. 683 enqueueUsers(SI); 684 else 685 // Otherwise the operand to the select is dead, and we can replace it 686 // with undef. 687 S.DeadOperands.push_back(U); 688 689 return; 690 } 691 if (!IsOffsetKnown) 692 return PI.setAborted(&SI); 693 694 // See if we already have computed info on this node. 695 uint64_t &SelectSize = PHIOrSelectSizes[&SI]; 696 if (!SelectSize) { 697 // This is a new Select, check for an unsafe use of it. 698 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectSize)) 699 return PI.setAborted(UnsafeI); 700 } 701 702 // For PHI and select operands outside the alloca, we can't nuke the entire 703 // phi or select -- the other side might still be relevant, so we special 704 // case them here and use a separate structure to track the operands 705 // themselves which should be replaced with undef. 706 // FIXME: This should instead be escaped in the event we're instrumenting 707 // for address sanitization. 708 if (Offset.uge(AllocSize)) { 709 S.DeadOperands.push_back(U); 710 return; 711 } 712 713 insertUse(SI, Offset, SelectSize); 714 } 715 716 /// \brief Disable SROA entirely if there are unhandled users of the alloca. 717 void visitInstruction(Instruction &I) { 718 PI.setAborted(&I); 719 } 720}; 721 722AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) 723 : 724#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 725 AI(AI), 726#endif 727 PointerEscapingInstr(0) { 728 SliceBuilder PB(DL, AI, *this); 729 SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); 730 if (PtrI.isEscaped() || PtrI.isAborted()) { 731 // FIXME: We should sink the escape vs. abort info into the caller nicely, 732 // possibly by just storing the PtrInfo in the AllocaSlices. 733 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst() 734 : PtrI.getAbortingInst(); 735 assert(PointerEscapingInstr && "Did not track a bad instruction"); 736 return; 737 } 738 739 Slices.erase(std::remove_if(Slices.begin(), Slices.end(), 740 std::mem_fun_ref(&Slice::isDead)), 741 Slices.end()); 742 743#if __cplusplus >= 201103L && !defined(NDEBUG) 744 if (SROARandomShuffleSlices) { 745 std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec())); 746 std::shuffle(Slices.begin(), Slices.end(), MT); 747 } 748#endif 749 750 // Sort the uses. This arranges for the offsets to be in ascending order, 751 // and the sizes to be in descending order. 752 std::sort(Slices.begin(), Slices.end()); 753} 754 755#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 756 757void AllocaSlices::print(raw_ostream &OS, const_iterator I, 758 StringRef Indent) const { 759 printSlice(OS, I, Indent); 760 printUse(OS, I, Indent); 761} 762 763void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I, 764 StringRef Indent) const { 765 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")" 766 << " slice #" << (I - begin()) 767 << (I->isSplittable() ? 
" (splittable)" : "") << "\n"; 768} 769 770void AllocaSlices::printUse(raw_ostream &OS, const_iterator I, 771 StringRef Indent) const { 772 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n"; 773} 774 775void AllocaSlices::print(raw_ostream &OS) const { 776 if (PointerEscapingInstr) { 777 OS << "Can't analyze slices for alloca: " << AI << "\n" 778 << " A pointer to this alloca escaped by:\n" 779 << " " << *PointerEscapingInstr << "\n"; 780 return; 781 } 782 783 OS << "Slices of alloca: " << AI << "\n"; 784 for (const_iterator I = begin(), E = end(); I != E; ++I) 785 print(OS, I); 786} 787 788LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const { 789 print(dbgs(), I); 790} 791LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); } 792 793#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 794 795namespace { 796/// \brief Implementation of LoadAndStorePromoter for promoting allocas. 797/// 798/// This subclass of LoadAndStorePromoter adds overrides to handle promoting 799/// the loads and stores of an alloca instruction, as well as updating its 800/// debug information. This is used when a domtree is unavailable and thus 801/// mem2reg in its full form can't be used to handle promotion of allocas to 802/// scalar values. 803class AllocaPromoter : public LoadAndStorePromoter { 804 AllocaInst &AI; 805 DIBuilder &DIB; 806 807 SmallVector<DbgDeclareInst *, 4> DDIs; 808 SmallVector<DbgValueInst *, 4> DVIs; 809 810public: 811 AllocaPromoter(const SmallVectorImpl<Instruction *> &Insts, SSAUpdater &S, 812 AllocaInst &AI, DIBuilder &DIB) 813 : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} 814 815 void run(const SmallVectorImpl<Instruction*> &Insts) { 816 // Retain the debug information attached to the alloca for use when 817 // rewriting loads and stores. 818 if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { 819 for (User *U : DebugNode->users()) 820 if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) 821 DDIs.push_back(DDI); 822 else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) 823 DVIs.push_back(DVI); 824 } 825 826 LoadAndStorePromoter::run(Insts); 827 828 // While we have the debug information, clear it off of the alloca. The 829 // caller takes care of deleting the alloca. 830 while (!DDIs.empty()) 831 DDIs.pop_back_val()->eraseFromParent(); 832 while (!DVIs.empty()) 833 DVIs.pop_back_val()->eraseFromParent(); 834 } 835 836 bool isInstInList(Instruction *I, 837 const SmallVectorImpl<Instruction*> &Insts) const override { 838 Value *Ptr; 839 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 840 Ptr = LI->getOperand(0); 841 else 842 Ptr = cast<StoreInst>(I)->getPointerOperand(); 843 844 // Only used to detect cycles, which will be rare and quickly found as 845 // we're walking up a chain of defs rather than down through uses. 
846 SmallPtrSet<Value *, 4> Visited; 847 848 do { 849 if (Ptr == &AI) 850 return true; 851 852 if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr)) 853 Ptr = BCI->getOperand(0); 854 else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) 855 Ptr = GEPI->getPointerOperand(); 856 else 857 return false; 858 859 } while (Visited.insert(Ptr)); 860 861 return false; 862 } 863 864 void updateDebugInfo(Instruction *Inst) const override { 865 for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(), 866 E = DDIs.end(); I != E; ++I) { 867 DbgDeclareInst *DDI = *I; 868 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) 869 ConvertDebugDeclareToDebugValue(DDI, SI, DIB); 870 else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) 871 ConvertDebugDeclareToDebugValue(DDI, LI, DIB); 872 } 873 for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(), 874 E = DVIs.end(); I != E; ++I) { 875 DbgValueInst *DVI = *I; 876 Value *Arg = 0; 877 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { 878 // If an argument is zero extended then use argument directly. The ZExt 879 // may be zapped by an optimization pass in future. 880 if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) 881 Arg = dyn_cast<Argument>(ZExt->getOperand(0)); 882 else if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) 883 Arg = dyn_cast<Argument>(SExt->getOperand(0)); 884 if (!Arg) 885 Arg = SI->getValueOperand(); 886 } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { 887 Arg = LI->getPointerOperand(); 888 } else { 889 continue; 890 } 891 Instruction *DbgVal = 892 DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), 893 Inst); 894 DbgVal->setDebugLoc(DVI->getDebugLoc()); 895 } 896 } 897}; 898} // end anon namespace 899 900 901namespace { 902/// \brief An optimization pass providing Scalar Replacement of Aggregates. 903/// 904/// This pass takes allocations which can be completely analyzed (that is, they 905/// don't escape) and tries to turn them into scalar SSA values. There are 906/// a few steps to this process. 907/// 908/// 1) It takes allocations of aggregates and analyzes the ways in which they 909/// are used to try to split them into smaller allocations, ideally of 910/// a single scalar data type. It will split up memcpy and memset accesses 911/// as necessary and try to isolate individual scalar accesses. 912/// 2) It will transform accesses into forms which are suitable for SSA value 913/// promotion. This can be replacing a memset with a scalar store of an 914/// integer value, or it can involve speculating operations on a PHI or 915/// select to be a PHI or select of the results. 916/// 3) Finally, this will try to detect a pattern of accesses which map cleanly 917/// onto insert and extract operations on a vector value, and convert them to 918/// this form. By doing so, it will enable promotion of vector aggregates to 919/// SSA vector values. 920class SROA : public FunctionPass { 921 const bool RequiresDomTree; 922 923 LLVMContext *C; 924 const DataLayout *DL; 925 DominatorTree *DT; 926 927 /// \brief Worklist of alloca instructions to simplify. 928 /// 929 /// Each alloca in the function is added to this. Each new alloca formed gets 930 /// added to it as well to recursively simplify unless that alloca can be 931 /// directly promoted. Finally, each time we rewrite a use of an alloca other 932 /// the one being actively rewritten, we add it back onto the list if not 933 /// already present to ensure it is re-visited. 
934 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist; 935 936 /// \brief A collection of instructions to delete. 937 /// We try to batch deletions to simplify code and make things a bit more 938 /// efficient. 939 SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts; 940 941 /// \brief Post-promotion worklist. 942 /// 943 /// Sometimes we discover an alloca which has a high probability of becoming 944 /// viable for SROA after a round of promotion takes place. In those cases, 945 /// the alloca is enqueued here for re-processing. 946 /// 947 /// Note that we have to be very careful to clear allocas out of this list in 948 /// the event they are deleted. 949 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist; 950 951 /// \brief A collection of alloca instructions we can directly promote. 952 std::vector<AllocaInst *> PromotableAllocas; 953 954 /// \brief A worklist of PHIs to speculate prior to promoting allocas. 955 /// 956 /// All of these PHIs have been checked for the safety of speculation and by 957 /// being speculated will allow promoting allocas currently in the promotable 958 /// queue. 959 SetVector<PHINode *, SmallVector<PHINode *, 2> > SpeculatablePHIs; 960 961 /// \brief A worklist of select instructions to speculate prior to promoting 962 /// allocas. 963 /// 964 /// All of these select instructions have been checked for the safety of 965 /// speculation and by being speculated will allow promoting allocas 966 /// currently in the promotable queue. 967 SetVector<SelectInst *, SmallVector<SelectInst *, 2> > SpeculatableSelects; 968 969public: 970 SROA(bool RequiresDomTree = true) 971 : FunctionPass(ID), RequiresDomTree(RequiresDomTree), 972 C(0), DL(0), DT(0) { 973 initializeSROAPass(*PassRegistry::getPassRegistry()); 974 } 975 bool runOnFunction(Function &F) override; 976 void getAnalysisUsage(AnalysisUsage &AU) const override; 977 978 const char *getPassName() const override { return "SROA"; } 979 static char ID; 980 981private: 982 friend class PHIOrSelectSpeculator; 983 friend class AllocaSliceRewriter; 984 985 bool rewritePartition(AllocaInst &AI, AllocaSlices &S, 986 AllocaSlices::iterator B, AllocaSlices::iterator E, 987 int64_t BeginOffset, int64_t EndOffset, 988 ArrayRef<AllocaSlices::iterator> SplitUses); 989 bool splitAlloca(AllocaInst &AI, AllocaSlices &S); 990 bool runOnAlloca(AllocaInst &AI); 991 void clobberUse(Use &U); 992 void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); 993 bool promoteAllocas(Function &F); 994}; 995} 996 997char SROA::ID = 0; 998 999FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { 1000 return new SROA(RequiresDomTree); 1001} 1002 1003INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", 1004 false, false) 1005INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 1006INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", 1007 false, false) 1008 1009/// Walk the range of a partitioning looking for a common type to cover this 1010/// sequence of slices. 1011static Type *findCommonType(AllocaSlices::const_iterator B, 1012 AllocaSlices::const_iterator E, 1013 uint64_t EndOffset) { 1014 Type *Ty = 0; 1015 bool TyIsCommon = true; 1016 IntegerType *ITy = 0; 1017 1018 // Note that we need to look at *every* alloca slice's Use to ensure we 1019 // always get consistent results regardless of the order of slices. 
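  // For example, if one slice loads the partition as an i32 while another
  // stores it as a float, no single common type is found (TyIsCommon becomes
  // false), but the i32 is remembered in ITy and is returned as the fallback.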
1020 for (AllocaSlices::const_iterator I = B; I != E; ++I) { 1021 Use *U = I->getUse(); 1022 if (isa<IntrinsicInst>(*U->getUser())) 1023 continue; 1024 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset) 1025 continue; 1026 1027 Type *UserTy = 0; 1028 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { 1029 UserTy = LI->getType(); 1030 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { 1031 UserTy = SI->getValueOperand()->getType(); 1032 } 1033 1034 if (!UserTy || (Ty && Ty != UserTy)) 1035 TyIsCommon = false; // Give up on anything but an iN type. 1036 else 1037 Ty = UserTy; 1038 1039 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) { 1040 // If the type is larger than the partition, skip it. We only encounter 1041 // this for split integer operations where we want to use the type of the 1042 // entity causing the split. Also skip if the type is not a byte width 1043 // multiple. 1044 if (UserITy->getBitWidth() % 8 != 0 || 1045 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset())) 1046 continue; 1047 1048 // Track the largest bitwidth integer type used in this way in case there 1049 // is no common type. 1050 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth()) 1051 ITy = UserITy; 1052 } 1053 } 1054 1055 return TyIsCommon ? Ty : ITy; 1056} 1057 1058/// PHI instructions that use an alloca and are subsequently loaded can be 1059/// rewritten to load both input pointers in the pred blocks and then PHI the 1060/// results, allowing the load of the alloca to be promoted. 1061/// From this: 1062/// %P2 = phi [i32* %Alloca, i32* %Other] 1063/// %V = load i32* %P2 1064/// to: 1065/// %V1 = load i32* %Alloca -> will be mem2reg'd 1066/// ... 1067/// %V2 = load i32* %Other 1068/// ... 1069/// %V = phi [i32 %V1, i32 %V2] 1070/// 1071/// We can do this to a select if its only uses are loads and if the operands 1072/// to the select can be loaded unconditionally. 1073/// 1074/// FIXME: This should be hoisted into a generic utility, likely in 1075/// Transforms/Util/Local.h 1076static bool isSafePHIToSpeculate(PHINode &PN, 1077 const DataLayout *DL = 0) { 1078 // For now, we can only do this promotion if the load is in the same block 1079 // as the PHI, and if there are no stores between the phi and load. 1080 // TODO: Allow recursive phi users. 1081 // TODO: Allow stores. 1082 BasicBlock *BB = PN.getParent(); 1083 unsigned MaxAlign = 0; 1084 bool HaveLoad = false; 1085 for (User *U : PN.users()) { 1086 LoadInst *LI = dyn_cast<LoadInst>(U); 1087 if (LI == 0 || !LI->isSimple()) 1088 return false; 1089 1090 // For now we only allow loads in the same block as the PHI. This is 1091 // a common case that happens when instcombine merges two loads through 1092 // a PHI. 1093 if (LI->getParent() != BB) 1094 return false; 1095 1096 // Ensure that there are no instructions between the PHI and the load that 1097 // could store. 1098 for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) 1099 if (BBI->mayWriteToMemory()) 1100 return false; 1101 1102 MaxAlign = std::max(MaxAlign, LI->getAlignment()); 1103 HaveLoad = true; 1104 } 1105 1106 if (!HaveLoad) 1107 return false; 1108 1109 // We can only transform this if it is safe to push the loads into the 1110 // predecessor blocks. The only thing to watch out for is that we can't put 1111 // a possibly trapping load in the predecessor if it is a critical edge. 
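  // For example, if a predecessor terminates in an invoke, or it has several
  // successors and the incoming pointer is not known to be dereferenceable,
  // hoisting the load would introduce a possible trap on a path that never
  // loaded before, so we reject the speculation below.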
1112 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 1113 TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); 1114 Value *InVal = PN.getIncomingValue(Idx); 1115 1116 // If the value is produced by the terminator of the predecessor (an 1117 // invoke) or it has side-effects, there is no valid place to put a load 1118 // in the predecessor. 1119 if (TI == InVal || TI->mayHaveSideEffects()) 1120 return false; 1121 1122 // If the predecessor has a single successor, then the edge isn't 1123 // critical. 1124 if (TI->getNumSuccessors() == 1) 1125 continue; 1126 1127 // If this pointer is always safe to load, or if we can prove that there 1128 // is already a load in the block, then we can move the load to the pred 1129 // block. 1130 if (InVal->isDereferenceablePointer() || 1131 isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL)) 1132 continue; 1133 1134 return false; 1135 } 1136 1137 return true; 1138} 1139 1140static void speculatePHINodeLoads(PHINode &PN) { 1141 DEBUG(dbgs() << " original: " << PN << "\n"); 1142 1143 Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); 1144 IRBuilderTy PHIBuilder(&PN); 1145 PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), 1146 PN.getName() + ".sroa.speculated"); 1147 1148 // Get the TBAA tag and alignment to use from one of the loads. It doesn't 1149 // matter which one we get and if any differ. 1150 LoadInst *SomeLoad = cast<LoadInst>(PN.user_back()); 1151 MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); 1152 unsigned Align = SomeLoad->getAlignment(); 1153 1154 // Rewrite all loads of the PN to use the new PHI. 1155 while (!PN.use_empty()) { 1156 LoadInst *LI = cast<LoadInst>(PN.user_back()); 1157 LI->replaceAllUsesWith(NewPN); 1158 LI->eraseFromParent(); 1159 } 1160 1161 // Inject loads into all of the pred blocks. 1162 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 1163 BasicBlock *Pred = PN.getIncomingBlock(Idx); 1164 TerminatorInst *TI = Pred->getTerminator(); 1165 Value *InVal = PN.getIncomingValue(Idx); 1166 IRBuilderTy PredBuilder(TI); 1167 1168 LoadInst *Load = PredBuilder.CreateLoad( 1169 InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName())); 1170 ++NumLoadsSpeculated; 1171 Load->setAlignment(Align); 1172 if (TBAATag) 1173 Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); 1174 NewPN->addIncoming(Load, Pred); 1175 } 1176 1177 DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); 1178 PN.eraseFromParent(); 1179} 1180 1181/// Select instructions that use an alloca and are subsequently loaded can be 1182/// rewritten to load both input pointers and then select between the result, 1183/// allowing the load of the alloca to be promoted. 1184/// From this: 1185/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other 1186/// %V = load i32* %P2 1187/// to: 1188/// %V1 = load i32* %Alloca -> will be mem2reg'd 1189/// %V2 = load i32* %Other 1190/// %V = select i1 %cond, i32 %V1, i32 %V2 1191/// 1192/// We can do this to a select if its only uses are loads and if the operand 1193/// to the select can be loaded unconditionally. 
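/// For example, speculation is refused when either select operand is a
/// pointer that is not known to be dereferenceable and no existing access
/// proves it safe to load, since the rewritten form would load both operands
/// unconditionally.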
1194static bool isSafeSelectToSpeculate(SelectInst &SI, const DataLayout *DL = 0) { 1195 Value *TValue = SI.getTrueValue(); 1196 Value *FValue = SI.getFalseValue(); 1197 bool TDerefable = TValue->isDereferenceablePointer(); 1198 bool FDerefable = FValue->isDereferenceablePointer(); 1199 1200 for (User *U : SI.users()) { 1201 LoadInst *LI = dyn_cast<LoadInst>(U); 1202 if (LI == 0 || !LI->isSimple()) 1203 return false; 1204 1205 // Both operands to the select need to be dereferencable, either 1206 // absolutely (e.g. allocas) or at this point because we can see other 1207 // accesses to it. 1208 if (!TDerefable && 1209 !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment(), DL)) 1210 return false; 1211 if (!FDerefable && 1212 !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment(), DL)) 1213 return false; 1214 } 1215 1216 return true; 1217} 1218 1219static void speculateSelectInstLoads(SelectInst &SI) { 1220 DEBUG(dbgs() << " original: " << SI << "\n"); 1221 1222 IRBuilderTy IRB(&SI); 1223 Value *TV = SI.getTrueValue(); 1224 Value *FV = SI.getFalseValue(); 1225 // Replace the loads of the select with a select of two loads. 1226 while (!SI.use_empty()) { 1227 LoadInst *LI = cast<LoadInst>(SI.user_back()); 1228 assert(LI->isSimple() && "We only speculate simple loads"); 1229 1230 IRB.SetInsertPoint(LI); 1231 LoadInst *TL = 1232 IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); 1233 LoadInst *FL = 1234 IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); 1235 NumLoadsSpeculated += 2; 1236 1237 // Transfer alignment and TBAA info if present. 1238 TL->setAlignment(LI->getAlignment()); 1239 FL->setAlignment(LI->getAlignment()); 1240 if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { 1241 TL->setMetadata(LLVMContext::MD_tbaa, Tag); 1242 FL->setMetadata(LLVMContext::MD_tbaa, Tag); 1243 } 1244 1245 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, 1246 LI->getName() + ".sroa.speculated"); 1247 1248 DEBUG(dbgs() << " speculated to: " << *V << "\n"); 1249 LI->replaceAllUsesWith(V); 1250 LI->eraseFromParent(); 1251 } 1252 SI.eraseFromParent(); 1253} 1254 1255/// \brief Build a GEP out of a base pointer and indices. 1256/// 1257/// This will return the BasePtr if that is valid, or build a new GEP 1258/// instruction using the IRBuilder if GEP-ing is needed. 1259static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, 1260 SmallVectorImpl<Value *> &Indices, Twine NamePrefix) { 1261 if (Indices.empty()) 1262 return BasePtr; 1263 1264 // A single zero index is a no-op, so check for this and avoid building a GEP 1265 // in that case. 1266 if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) 1267 return BasePtr; 1268 1269 return IRB.CreateInBoundsGEP(BasePtr, Indices, NamePrefix + "sroa_idx"); 1270} 1271 1272/// \brief Get a natural GEP off of the BasePtr walking through Ty toward 1273/// TargetTy without changing the offset of the pointer. 1274/// 1275/// This routine assumes we've already established a properly offset GEP with 1276/// Indices, and arrived at the Ty type. The goal is to continue to GEP with 1277/// zero-indices down through type layers until we find one the same as 1278/// TargetTy. If we can't find one with the same type, we at least try to use 1279/// one with the same size. If none of that works, we just produce the GEP as 1280/// indicated by Indices to have the correct offset. 
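/// For example, descending from { [8 x i32] } toward an i32 target appends a
/// zero struct-field index (i32 0) followed by a zero array index of pointer
/// width (e.g. i64 0 on a 64-bit target) before the GEP is finally built.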
1281static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, 1282 Value *BasePtr, Type *Ty, Type *TargetTy, 1283 SmallVectorImpl<Value *> &Indices, 1284 Twine NamePrefix) { 1285 if (Ty == TargetTy) 1286 return buildGEP(IRB, BasePtr, Indices, NamePrefix); 1287 1288 // Pointer size to use for the indices. 1289 unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType()); 1290 1291 // See if we can descend into a struct and locate a field with the correct 1292 // type. 1293 unsigned NumLayers = 0; 1294 Type *ElementTy = Ty; 1295 do { 1296 if (ElementTy->isPointerTy()) 1297 break; 1298 1299 if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) { 1300 ElementTy = ArrayTy->getElementType(); 1301 Indices.push_back(IRB.getIntN(PtrSize, 0)); 1302 } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) { 1303 ElementTy = VectorTy->getElementType(); 1304 Indices.push_back(IRB.getInt32(0)); 1305 } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { 1306 if (STy->element_begin() == STy->element_end()) 1307 break; // Nothing left to descend into. 1308 ElementTy = *STy->element_begin(); 1309 Indices.push_back(IRB.getInt32(0)); 1310 } else { 1311 break; 1312 } 1313 ++NumLayers; 1314 } while (ElementTy != TargetTy); 1315 if (ElementTy != TargetTy) 1316 Indices.erase(Indices.end() - NumLayers, Indices.end()); 1317 1318 return buildGEP(IRB, BasePtr, Indices, NamePrefix); 1319} 1320 1321/// \brief Recursively compute indices for a natural GEP. 1322/// 1323/// This is the recursive step for getNaturalGEPWithOffset that walks down the 1324/// element types adding appropriate indices for the GEP. 1325static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, 1326 Value *Ptr, Type *Ty, APInt &Offset, 1327 Type *TargetTy, 1328 SmallVectorImpl<Value *> &Indices, 1329 Twine NamePrefix) { 1330 if (Offset == 0) 1331 return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, NamePrefix); 1332 1333 // We can't recurse through pointer types. 1334 if (Ty->isPointerTy()) 1335 return 0; 1336 1337 // We try to analyze GEPs over vectors here, but note that these GEPs are 1338 // extremely poorly defined currently. The long-term goal is to remove GEPing 1339 // over a vector from the IR completely. 1340 if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { 1341 unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType()); 1342 if (ElementSizeInBits % 8) 1343 return 0; // GEPs over non-multiple of 8 size vector elements are invalid. 
1344 APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); 1345 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1346 if (NumSkippedElements.ugt(VecTy->getNumElements())) 1347 return 0; 1348 Offset -= NumSkippedElements * ElementSize; 1349 Indices.push_back(IRB.getInt(NumSkippedElements)); 1350 return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(), 1351 Offset, TargetTy, Indices, NamePrefix); 1352 } 1353 1354 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 1355 Type *ElementTy = ArrTy->getElementType(); 1356 APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); 1357 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1358 if (NumSkippedElements.ugt(ArrTy->getNumElements())) 1359 return 0; 1360 1361 Offset -= NumSkippedElements * ElementSize; 1362 Indices.push_back(IRB.getInt(NumSkippedElements)); 1363 return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, 1364 Indices, NamePrefix); 1365 } 1366 1367 StructType *STy = dyn_cast<StructType>(Ty); 1368 if (!STy) 1369 return 0; 1370 1371 const StructLayout *SL = DL.getStructLayout(STy); 1372 uint64_t StructOffset = Offset.getZExtValue(); 1373 if (StructOffset >= SL->getSizeInBytes()) 1374 return 0; 1375 unsigned Index = SL->getElementContainingOffset(StructOffset); 1376 Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); 1377 Type *ElementTy = STy->getElementType(Index); 1378 if (Offset.uge(DL.getTypeAllocSize(ElementTy))) 1379 return 0; // The offset points into alignment padding. 1380 1381 Indices.push_back(IRB.getInt32(Index)); 1382 return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, 1383 Indices, NamePrefix); 1384} 1385 1386/// \brief Get a natural GEP from a base pointer to a particular offset and 1387/// resulting in a particular type. 1388/// 1389/// The goal is to produce a "natural" looking GEP that works with the existing 1390/// composite types to arrive at the appropriate offset and element type for 1391/// a pointer. TargetTy is the element type the returned GEP should point-to if 1392/// possible. We recurse by decreasing Offset, adding the appropriate index to 1393/// Indices, and setting Ty to the result subtype. 1394/// 1395/// If no natural GEP can be constructed, this function returns null. 1396static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, 1397 Value *Ptr, APInt Offset, Type *TargetTy, 1398 SmallVectorImpl<Value *> &Indices, 1399 Twine NamePrefix) { 1400 PointerType *Ty = cast<PointerType>(Ptr->getType()); 1401 1402 // Don't consider any GEPs through an i8* as natural unless the TargetTy is 1403 // an i8. 1404 if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8)) 1405 return 0; 1406 1407 Type *ElementTy = Ty->getElementType(); 1408 if (!ElementTy->isSized()) 1409 return 0; // We can't GEP through an unsized element. 1410 APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); 1411 if (ElementSize == 0) 1412 return 0; // Zero-length arrays can't help us build a natural GEP. 1413 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1414 1415 Offset -= NumSkippedElements * ElementSize; 1416 Indices.push_back(IRB.getInt(NumSkippedElements)); 1417 return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, 1418 Indices, NamePrefix); 1419} 1420 1421/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the 1422/// resulting pointer has PointerTy. 
1423/// 1424/// This tries very hard to compute a "natural" GEP which arrives at the offset 1425/// and produces the pointer type desired. Where it cannot, it will try to use 1426/// the natural GEP to arrive at the offset and bitcast to the type. Where that 1427/// fails, it will try to use an existing i8* and GEP to the byte offset and 1428/// bitcast to the type. 1429/// 1430/// The strategy for finding the more natural GEPs is to peel off layers of the 1431/// pointer, walking back through bit casts and GEPs, searching for a base 1432/// pointer from which we can compute a natural GEP with the desired 1433/// properties. The algorithm tries to fold as many constant indices into 1434/// a single GEP as possible, thus making each GEP more independent of the 1435/// surrounding code. 1436static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, 1437 APInt Offset, Type *PointerTy, 1438 Twine NamePrefix) { 1439 // Even though we don't look through PHI nodes, we could be called on an 1440 // instruction in an unreachable block, which may be on a cycle. 1441 SmallPtrSet<Value *, 4> Visited; 1442 Visited.insert(Ptr); 1443 SmallVector<Value *, 4> Indices; 1444 1445 // We may end up computing an offset pointer that has the wrong type. If we 1446 // never are able to compute one directly that has the correct type, we'll 1447 // fall back to it, so keep it around here. 1448 Value *OffsetPtr = 0; 1449 1450 // Remember any i8 pointer we come across to re-use if we need to do a raw 1451 // byte offset. 1452 Value *Int8Ptr = 0; 1453 APInt Int8PtrOffset(Offset.getBitWidth(), 0); 1454 1455 Type *TargetTy = PointerTy->getPointerElementType(); 1456 1457 do { 1458 // First fold any existing GEPs into the offset. 1459 while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { 1460 APInt GEPOffset(Offset.getBitWidth(), 0); 1461 if (!GEP->accumulateConstantOffset(DL, GEPOffset)) 1462 break; 1463 Offset += GEPOffset; 1464 Ptr = GEP->getPointerOperand(); 1465 if (!Visited.insert(Ptr)) 1466 break; 1467 } 1468 1469 // See if we can perform a natural GEP here. 1470 Indices.clear(); 1471 if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy, 1472 Indices, NamePrefix)) { 1473 if (P->getType() == PointerTy) { 1474 // Zap any offset pointer that we ended up computing in previous rounds. 1475 if (OffsetPtr && OffsetPtr->use_empty()) 1476 if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) 1477 I->eraseFromParent(); 1478 return P; 1479 } 1480 if (!OffsetPtr) { 1481 OffsetPtr = P; 1482 } 1483 } 1484 1485 // Stash this pointer if we've found an i8*. 1486 if (Ptr->getType()->isIntegerTy(8)) { 1487 Int8Ptr = Ptr; 1488 Int8PtrOffset = Offset; 1489 } 1490 1491 // Peel off a layer of the pointer and update the offset appropriately. 1492 if (Operator::getOpcode(Ptr) == Instruction::BitCast) { 1493 Ptr = cast<Operator>(Ptr)->getOperand(0); 1494 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) { 1495 if (GA->mayBeOverridden()) 1496 break; 1497 Ptr = GA->getAliasee(); 1498 } else { 1499 break; 1500 } 1501 assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!"); 1502 } while (Visited.insert(Ptr)); 1503 1504 if (!OffsetPtr) { 1505 if (!Int8Ptr) { 1506 Int8Ptr = IRB.CreateBitCast( 1507 Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()), 1508 NamePrefix + "sroa_raw_cast"); 1509 Int8PtrOffset = Offset; 1510 } 1511 1512 OffsetPtr = Int8PtrOffset == 0 ? 
Int8Ptr : 1513 IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), 1514 NamePrefix + "sroa_raw_idx"); 1515 } 1516 Ptr = OffsetPtr; 1517 1518 // On the off chance we were targeting i8*, guard the bitcast here. 1519 if (Ptr->getType() != PointerTy) 1520 Ptr = IRB.CreateBitCast(Ptr, PointerTy, NamePrefix + "sroa_cast"); 1521 1522 return Ptr; 1523} 1524 1525/// \brief Test whether we can convert a value from the old to the new type. 1526/// 1527/// This predicate should be used to guard calls to convertValue in order to 1528/// ensure that we only try to convert viable values. The strategy is that we 1529/// will peel off single element struct and array wrappings to get to an 1530/// underlying value, and convert that value. 1531static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { 1532 if (OldTy == NewTy) 1533 return true; 1534 if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy)) 1535 if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy)) 1536 if (NewITy->getBitWidth() >= OldITy->getBitWidth()) 1537 return true; 1538 if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) 1539 return false; 1540 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) 1541 return false; 1542 1543 // We can convert pointers to integers and vice-versa. Same for vectors 1544 // of pointers and integers. 1545 OldTy = OldTy->getScalarType(); 1546 NewTy = NewTy->getScalarType(); 1547 if (NewTy->isPointerTy() || OldTy->isPointerTy()) { 1548 if (NewTy->isPointerTy() && OldTy->isPointerTy()) 1549 return true; 1550 if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) 1551 return true; 1552 return false; 1553 } 1554 1555 return true; 1556} 1557 1558/// \brief Generic routine to convert an SSA value to a value of a different 1559/// type. 1560/// 1561/// This will try various different casting techniques, such as bitcasts, 1562/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test 1563/// two types for viability with this routine. 1564static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, 1565 Type *NewTy) { 1566 Type *OldTy = V->getType(); 1567 assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type"); 1568 1569 if (OldTy == NewTy) 1570 return V; 1571 1572 if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy)) 1573 if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy)) 1574 if (NewITy->getBitWidth() > OldITy->getBitWidth()) 1575 return IRB.CreateZExt(V, NewITy); 1576 1577 // See if we need inttoptr for this type pair. A cast involving both scalars 1578 // and vectors requires and additional bitcast. 1579 if (OldTy->getScalarType()->isIntegerTy() && 1580 NewTy->getScalarType()->isPointerTy()) { 1581 // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8* 1582 if (OldTy->isVectorTy() && !NewTy->isVectorTy()) 1583 return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), 1584 NewTy); 1585 1586 // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*> 1587 if (!OldTy->isVectorTy() && NewTy->isVectorTy()) 1588 return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), 1589 NewTy); 1590 1591 return IRB.CreateIntToPtr(V, NewTy); 1592 } 1593 1594 // See if we need ptrtoint for this type pair. A cast involving both scalars 1595 // and vectors requires and additional bitcast. 
1596 if (OldTy->getScalarType()->isPointerTy() && 1597 NewTy->getScalarType()->isIntegerTy()) { 1598 // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128 1599 if (OldTy->isVectorTy() && !NewTy->isVectorTy()) 1600 return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), 1601 NewTy); 1602 1603 // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32> 1604 if (!OldTy->isVectorTy() && NewTy->isVectorTy()) 1605 return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), 1606 NewTy); 1607 1608 return IRB.CreatePtrToInt(V, NewTy); 1609 } 1610 1611 return IRB.CreateBitCast(V, NewTy); 1612} 1613 1614/// \brief Test whether the given slice use can be promoted to a vector. 1615/// 1616/// This function is called to test each entry in a partioning which is slated 1617/// for a single slice. 1618static bool isVectorPromotionViableForSlice( 1619 const DataLayout &DL, AllocaSlices &S, uint64_t SliceBeginOffset, 1620 uint64_t SliceEndOffset, VectorType *Ty, uint64_t ElementSize, 1621 AllocaSlices::const_iterator I) { 1622 // First validate the slice offsets. 1623 uint64_t BeginOffset = 1624 std::max(I->beginOffset(), SliceBeginOffset) - SliceBeginOffset; 1625 uint64_t BeginIndex = BeginOffset / ElementSize; 1626 if (BeginIndex * ElementSize != BeginOffset || 1627 BeginIndex >= Ty->getNumElements()) 1628 return false; 1629 uint64_t EndOffset = 1630 std::min(I->endOffset(), SliceEndOffset) - SliceBeginOffset; 1631 uint64_t EndIndex = EndOffset / ElementSize; 1632 if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements()) 1633 return false; 1634 1635 assert(EndIndex > BeginIndex && "Empty vector!"); 1636 uint64_t NumElements = EndIndex - BeginIndex; 1637 Type *SliceTy = 1638 (NumElements == 1) ? Ty->getElementType() 1639 : VectorType::get(Ty->getElementType(), NumElements); 1640 1641 Type *SplitIntTy = 1642 Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8); 1643 1644 Use *U = I->getUse(); 1645 1646 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { 1647 if (MI->isVolatile()) 1648 return false; 1649 if (!I->isSplittable()) 1650 return false; // Skip any unsplittable intrinsics. 1651 } else if (U->get()->getType()->getPointerElementType()->isStructTy()) { 1652 // Disable vector promotion when there are loads or stores of an FCA. 1653 return false; 1654 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { 1655 if (LI->isVolatile()) 1656 return false; 1657 Type *LTy = LI->getType(); 1658 if (SliceBeginOffset > I->beginOffset() || 1659 SliceEndOffset < I->endOffset()) { 1660 assert(LTy->isIntegerTy()); 1661 LTy = SplitIntTy; 1662 } 1663 if (!canConvertValue(DL, SliceTy, LTy)) 1664 return false; 1665 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { 1666 if (SI->isVolatile()) 1667 return false; 1668 Type *STy = SI->getValueOperand()->getType(); 1669 if (SliceBeginOffset > I->beginOffset() || 1670 SliceEndOffset < I->endOffset()) { 1671 assert(STy->isIntegerTy()); 1672 STy = SplitIntTy; 1673 } 1674 if (!canConvertValue(DL, STy, SliceTy)) 1675 return false; 1676 } else { 1677 return false; 1678 } 1679 1680 return true; 1681} 1682 1683/// \brief Test whether the given alloca partitioning and range of slices can be 1684/// promoted to a vector. 1685/// 1686/// This is a quick test to check whether we can rewrite a particular alloca 1687/// partition (and its newly formed alloca) into a vector alloca with only 1688/// whole-vector loads and stores such that it could be promoted to a vector 1689/// SSA value. 
We only can ensure this for a limited set of operations, and we 1690/// don't want to do the rewrites unless we are confident that the result will 1691/// be promotable, so we have an early test here. 1692static bool 1693isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, AllocaSlices &S, 1694 uint64_t SliceBeginOffset, uint64_t SliceEndOffset, 1695 AllocaSlices::const_iterator I, 1696 AllocaSlices::const_iterator E, 1697 ArrayRef<AllocaSlices::iterator> SplitUses) { 1698 VectorType *Ty = dyn_cast<VectorType>(AllocaTy); 1699 if (!Ty) 1700 return false; 1701 1702 uint64_t ElementSize = DL.getTypeSizeInBits(Ty->getScalarType()); 1703 1704 // While the definition of LLVM vectors is bitpacked, we don't support sizes 1705 // that aren't byte sized. 1706 if (ElementSize % 8) 1707 return false; 1708 assert((DL.getTypeSizeInBits(Ty) % 8) == 0 && 1709 "vector size not a multiple of element size?"); 1710 ElementSize /= 8; 1711 1712 for (; I != E; ++I) 1713 if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset, 1714 SliceEndOffset, Ty, ElementSize, I)) 1715 return false; 1716 1717 for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), 1718 SUE = SplitUses.end(); 1719 SUI != SUE; ++SUI) 1720 if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset, 1721 SliceEndOffset, Ty, ElementSize, *SUI)) 1722 return false; 1723 1724 return true; 1725} 1726 1727/// \brief Test whether a slice of an alloca is valid for integer widening. 1728/// 1729/// This implements the necessary checking for the \c isIntegerWideningViable 1730/// test below on a single slice of the alloca. 1731static bool isIntegerWideningViableForSlice(const DataLayout &DL, 1732 Type *AllocaTy, 1733 uint64_t AllocBeginOffset, 1734 uint64_t Size, AllocaSlices &S, 1735 AllocaSlices::const_iterator I, 1736 bool &WholeAllocaOp) { 1737 uint64_t RelBegin = I->beginOffset() - AllocBeginOffset; 1738 uint64_t RelEnd = I->endOffset() - AllocBeginOffset; 1739 1740 // We can't reasonably handle cases where the load or store extends past 1741 // the end of the aloca's type and into its padding. 1742 if (RelEnd > Size) 1743 return false; 1744 1745 Use *U = I->getUse(); 1746 1747 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { 1748 if (LI->isVolatile()) 1749 return false; 1750 if (RelBegin == 0 && RelEnd == Size) 1751 WholeAllocaOp = true; 1752 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { 1753 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) 1754 return false; 1755 } else if (RelBegin != 0 || RelEnd != Size || 1756 !canConvertValue(DL, AllocaTy, LI->getType())) { 1757 // Non-integer loads need to be convertible from the alloca type so that 1758 // they are promotable. 1759 return false; 1760 } 1761 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { 1762 Type *ValueTy = SI->getValueOperand()->getType(); 1763 if (SI->isVolatile()) 1764 return false; 1765 if (RelBegin == 0 && RelEnd == Size) 1766 WholeAllocaOp = true; 1767 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { 1768 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) 1769 return false; 1770 } else if (RelBegin != 0 || RelEnd != Size || 1771 !canConvertValue(DL, ValueTy, AllocaTy)) { 1772 // Non-integer stores need to be convertible to the alloca type so that 1773 // they are promotable. 
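// As a purely hypothetical illustration: for an alloca of type double, a
// whole-width store of a double is viable because canConvertValue can
// bitcast between double and i64, whereas a float stored into only the low
// four bytes covers part of the alloca and blocks widening:
//   %a = alloca double
//   store double %d, double* %a       ; full width, convertible -> viable
//   %f.ptr = bitcast double* %a to float*
//   store float %f, float* %f.ptr     ; partial width -> not viable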
1774 return false; 1775 } 1776 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { 1777 if (MI->isVolatile() || !isa<Constant>(MI->getLength())) 1778 return false; 1779 if (!I->isSplittable()) 1780 return false; // Skip any unsplittable intrinsics. 1781 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { 1782 if (II->getIntrinsicID() != Intrinsic::lifetime_start && 1783 II->getIntrinsicID() != Intrinsic::lifetime_end) 1784 return false; 1785 } else { 1786 return false; 1787 } 1788 1789 return true; 1790} 1791 1792/// \brief Test whether the given alloca partition's integer operations can be 1793/// widened to promotable ones. 1794/// 1795/// This is a quick test to check whether we can rewrite the integer loads and 1796/// stores to a particular alloca into wider loads and stores and be able to 1797/// promote the resulting alloca. 1798static bool 1799isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy, 1800 uint64_t AllocBeginOffset, AllocaSlices &S, 1801 AllocaSlices::const_iterator I, 1802 AllocaSlices::const_iterator E, 1803 ArrayRef<AllocaSlices::iterator> SplitUses) { 1804 uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy); 1805 // Don't create integer types larger than the maximum bitwidth. 1806 if (SizeInBits > IntegerType::MAX_INT_BITS) 1807 return false; 1808 1809 // Don't try to handle allocas with bit-padding. 1810 if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy)) 1811 return false; 1812 1813 // We need to ensure that an integer type with the appropriate bitwidth can 1814 // be converted to the alloca type, whatever that is. We don't want to force 1815 // the alloca itself to have an integer type if there is a more suitable one. 1816 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits); 1817 if (!canConvertValue(DL, AllocaTy, IntTy) || 1818 !canConvertValue(DL, IntTy, AllocaTy)) 1819 return false; 1820 1821 uint64_t Size = DL.getTypeStoreSize(AllocaTy); 1822 1823 // While examining uses, we ensure that the alloca has a covering load or 1824 // store. We don't want to widen the integer operations only to fail to 1825 // promote due to some other unsplittable entry (which we may make splittable 1826 // later). However, if there are only splittable uses, go ahead and assume 1827 // that we cover the alloca. 1828 bool WholeAllocaOp = (I != E) ? 
false : DL.isLegalInteger(SizeInBits); 1829 1830 for (; I != E; ++I) 1831 if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, 1832 S, I, WholeAllocaOp)) 1833 return false; 1834 1835 for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), 1836 SUE = SplitUses.end(); 1837 SUI != SUE; ++SUI) 1838 if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, 1839 S, *SUI, WholeAllocaOp)) 1840 return false; 1841 1842 return WholeAllocaOp; 1843} 1844 1845static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, 1846 IntegerType *Ty, uint64_t Offset, 1847 const Twine &Name) { 1848 DEBUG(dbgs() << " start: " << *V << "\n"); 1849 IntegerType *IntTy = cast<IntegerType>(V->getType()); 1850 assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && 1851 "Element extends past full value"); 1852 uint64_t ShAmt = 8*Offset; 1853 if (DL.isBigEndian()) 1854 ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); 1855 if (ShAmt) { 1856 V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); 1857 DEBUG(dbgs() << " shifted: " << *V << "\n"); 1858 } 1859 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 1860 "Cannot extract to a larger integer!"); 1861 if (Ty != IntTy) { 1862 V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); 1863 DEBUG(dbgs() << " trunced: " << *V << "\n"); 1864 } 1865 return V; 1866} 1867 1868static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, 1869 Value *V, uint64_t Offset, const Twine &Name) { 1870 IntegerType *IntTy = cast<IntegerType>(Old->getType()); 1871 IntegerType *Ty = cast<IntegerType>(V->getType()); 1872 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 1873 "Cannot insert a larger integer!"); 1874 DEBUG(dbgs() << " start: " << *V << "\n"); 1875 if (Ty != IntTy) { 1876 V = IRB.CreateZExt(V, IntTy, Name + ".ext"); 1877 DEBUG(dbgs() << " extended: " << *V << "\n"); 1878 } 1879 assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && 1880 "Element store outside of alloca store"); 1881 uint64_t ShAmt = 8*Offset; 1882 if (DL.isBigEndian()) 1883 ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); 1884 if (ShAmt) { 1885 V = IRB.CreateShl(V, ShAmt, Name + ".shift"); 1886 DEBUG(dbgs() << " shifted: " << *V << "\n"); 1887 } 1888 1889 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { 1890 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); 1891 Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); 1892 DEBUG(dbgs() << " masked: " << *Old << "\n"); 1893 V = IRB.CreateOr(Old, V, Name + ".insert"); 1894 DEBUG(dbgs() << " inserted: " << *V << "\n"); 1895 } 1896 return V; 1897} 1898 1899static Value *extractVector(IRBuilderTy &IRB, Value *V, 1900 unsigned BeginIndex, unsigned EndIndex, 1901 const Twine &Name) { 1902 VectorType *VecTy = cast<VectorType>(V->getType()); 1903 unsigned NumElements = EndIndex - BeginIndex; 1904 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 1905 1906 if (NumElements == VecTy->getNumElements()) 1907 return V; 1908 1909 if (NumElements == 1) { 1910 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex), 1911 Name + ".extract"); 1912 DEBUG(dbgs() << " extract: " << *V << "\n"); 1913 return V; 1914 } 1915 1916 SmallVector<Constant*, 8> Mask; 1917 Mask.reserve(NumElements); 1918 for (unsigned i = BeginIndex; i != EndIndex; ++i) 1919 Mask.push_back(IRB.getInt32(i)); 1920 V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), 1921 ConstantVector::get(Mask), 1922 
Name + ".extract"); 1923 DEBUG(dbgs() << " shuffle: " << *V << "\n"); 1924 return V; 1925} 1926 1927static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, 1928 unsigned BeginIndex, const Twine &Name) { 1929 VectorType *VecTy = cast<VectorType>(Old->getType()); 1930 assert(VecTy && "Can only insert a vector into a vector"); 1931 1932 VectorType *Ty = dyn_cast<VectorType>(V->getType()); 1933 if (!Ty) { 1934 // Single element to insert. 1935 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex), 1936 Name + ".insert"); 1937 DEBUG(dbgs() << " insert: " << *V << "\n"); 1938 return V; 1939 } 1940 1941 assert(Ty->getNumElements() <= VecTy->getNumElements() && 1942 "Too many elements!"); 1943 if (Ty->getNumElements() == VecTy->getNumElements()) { 1944 assert(V->getType() == VecTy && "Vector type mismatch"); 1945 return V; 1946 } 1947 unsigned EndIndex = BeginIndex + Ty->getNumElements(); 1948 1949 // When inserting a smaller vector into the larger to store, we first 1950 // use a shuffle vector to widen it with undef elements, and then 1951 // a second shuffle vector to select between the loaded vector and the 1952 // incoming vector. 1953 SmallVector<Constant*, 8> Mask; 1954 Mask.reserve(VecTy->getNumElements()); 1955 for (unsigned i = 0; i != VecTy->getNumElements(); ++i) 1956 if (i >= BeginIndex && i < EndIndex) 1957 Mask.push_back(IRB.getInt32(i - BeginIndex)); 1958 else 1959 Mask.push_back(UndefValue::get(IRB.getInt32Ty())); 1960 V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), 1961 ConstantVector::get(Mask), 1962 Name + ".expand"); 1963 DEBUG(dbgs() << " shuffle: " << *V << "\n"); 1964 1965 Mask.clear(); 1966 for (unsigned i = 0; i != VecTy->getNumElements(); ++i) 1967 Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex)); 1968 1969 V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend"); 1970 1971 DEBUG(dbgs() << " blend: " << *V << "\n"); 1972 return V; 1973} 1974 1975namespace { 1976/// \brief Visitor to rewrite instructions using p particular slice of an alloca 1977/// to use a new alloca. 1978/// 1979/// Also implements the rewriting to vector-based accesses when the partition 1980/// passes the isVectorPromotionViable predicate. Most of the rewriting logic 1981/// lives here. 1982class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { 1983 // Befriend the base class so it can delegate to private visit methods. 1984 friend class llvm::InstVisitor<AllocaSliceRewriter, bool>; 1985 typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base; 1986 1987 const DataLayout &DL; 1988 AllocaSlices &S; 1989 SROA &Pass; 1990 AllocaInst &OldAI, &NewAI; 1991 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; 1992 Type *NewAllocaTy; 1993 1994 // If we are rewriting an alloca partition which can be written as pure 1995 // vector operations, we stash extra information here. When VecTy is 1996 // non-null, we have some strict guarantees about the rewritten alloca: 1997 // - The new alloca is exactly the size of the vector type here. 1998 // - The accesses all either map to the entire vector or to a single 1999 // element. 2000 // - The set of accessing instructions is only one of those handled above 2001 // in isVectorPromotionViable. Generally these are the same access kinds 2002 // which are promotable via mem2reg. 
2003 VectorType *VecTy; 2004 Type *ElementTy; 2005 uint64_t ElementSize; 2006 2007 // This is a convenience and flag variable that will be null unless the new 2008 // alloca's integer operations should be widened to this integer type due to 2009 // passing isIntegerWideningViable above. If it is non-null, the desired 2010 // integer type will be stored here for easy access during rewriting. 2011 IntegerType *IntTy; 2012 2013 // The original offset of the slice currently being rewritten relative to 2014 // the original alloca. 2015 uint64_t BeginOffset, EndOffset; 2016 // The new offsets of the slice currently being rewritten relative to the 2017 // original alloca. 2018 uint64_t NewBeginOffset, NewEndOffset; 2019 2020 uint64_t SliceSize; 2021 bool IsSplittable; 2022 bool IsSplit; 2023 Use *OldUse; 2024 Instruction *OldPtr; 2025 2026 // Track post-rewrite users which are PHI nodes and Selects. 2027 SmallPtrSetImpl<PHINode *> &PHIUsers; 2028 SmallPtrSetImpl<SelectInst *> &SelectUsers; 2029 2030 // Utility IR builder, whose name prefix is setup for each visited use, and 2031 // the insertion point is set to point to the user. 2032 IRBuilderTy IRB; 2033 2034public: 2035 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &S, SROA &Pass, 2036 AllocaInst &OldAI, AllocaInst &NewAI, 2037 uint64_t NewAllocaBeginOffset, 2038 uint64_t NewAllocaEndOffset, bool IsVectorPromotable, 2039 bool IsIntegerPromotable, 2040 SmallPtrSetImpl<PHINode *> &PHIUsers, 2041 SmallPtrSetImpl<SelectInst *> &SelectUsers) 2042 : DL(DL), S(S), Pass(Pass), OldAI(OldAI), NewAI(NewAI), 2043 NewAllocaBeginOffset(NewAllocaBeginOffset), 2044 NewAllocaEndOffset(NewAllocaEndOffset), 2045 NewAllocaTy(NewAI.getAllocatedType()), 2046 VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : 0), 2047 ElementTy(VecTy ? VecTy->getElementType() : 0), 2048 ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), 2049 IntTy(IsIntegerPromotable 2050 ? Type::getIntNTy( 2051 NewAI.getContext(), 2052 DL.getTypeSizeInBits(NewAI.getAllocatedType())) 2053 : 0), 2054 BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), 2055 OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers), 2056 IRB(NewAI.getContext(), ConstantFolder()) { 2057 if (VecTy) { 2058 assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 && 2059 "Only multiple-of-8 sized vector elements are viable"); 2060 ++NumVectorized; 2061 } 2062 assert((!IsVectorPromotable && !IsIntegerPromotable) || 2063 IsVectorPromotable != IsIntegerPromotable); 2064 } 2065 2066 bool visit(AllocaSlices::const_iterator I) { 2067 bool CanSROA = true; 2068 BeginOffset = I->beginOffset(); 2069 EndOffset = I->endOffset(); 2070 IsSplittable = I->isSplittable(); 2071 IsSplit = 2072 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; 2073 2074 // Compute the intersecting offset range. 2075 assert(BeginOffset < NewAllocaEndOffset); 2076 assert(EndOffset > NewAllocaBeginOffset); 2077 NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); 2078 NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); 2079 2080 SliceSize = NewEndOffset - NewBeginOffset; 2081 2082 OldUse = I->getUse(); 2083 OldPtr = cast<Instruction>(OldUse->get()); 2084 2085 Instruction *OldUserI = cast<Instruction>(OldUse->getUser()); 2086 IRB.SetInsertPoint(OldUserI); 2087 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc()); 2088 IRB.SetNamePrefix(Twine(NewAI.getName()) + "." 
+ Twine(BeginOffset) + "."); 2089 2090 CanSROA &= visit(cast<Instruction>(OldUse->getUser())); 2091 if (VecTy || IntTy) 2092 assert(CanSROA); 2093 return CanSROA; 2094 } 2095 2096private: 2097 // Make sure the other visit overloads are visible. 2098 using Base::visit; 2099 2100 // Every instruction which can end up as a user must have a rewrite rule. 2101 bool visitInstruction(Instruction &I) { 2102 DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); 2103 llvm_unreachable("No rewrite rule for this instruction!"); 2104 } 2105 2106 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) { 2107 // Note that the offset computation can use BeginOffset or NewBeginOffset 2108 // interchangeably for unsplit slices. 2109 assert(IsSplit || BeginOffset == NewBeginOffset); 2110 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 2111 2112#ifndef NDEBUG 2113 StringRef OldName = OldPtr->getName(); 2114 // Skip through the last '.sroa.' component of the name. 2115 size_t LastSROAPrefix = OldName.rfind(".sroa."); 2116 if (LastSROAPrefix != StringRef::npos) { 2117 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa.")); 2118 // Look for an SROA slice index. 2119 size_t IndexEnd = OldName.find_first_not_of("0123456789"); 2120 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') { 2121 // Strip the index and look for the offset. 2122 OldName = OldName.substr(IndexEnd + 1); 2123 size_t OffsetEnd = OldName.find_first_not_of("0123456789"); 2124 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.') 2125 // Strip the offset. 2126 OldName = OldName.substr(OffsetEnd + 1); 2127 } 2128 } 2129 // Strip any SROA suffixes as well. 2130 OldName = OldName.substr(0, OldName.find(".sroa_")); 2131#endif 2132 2133 return getAdjustedPtr(IRB, DL, &NewAI, 2134 APInt(DL.getPointerSizeInBits(), Offset), PointerTy, 2135#ifndef NDEBUG 2136 Twine(OldName) + "." 2137#else 2138 Twine() 2139#endif 2140 ); 2141 } 2142 2143 /// \brief Compute suitable alignment to access this slice of the *new* alloca. 2144 /// 2145 /// You can optionally pass a type to this routine and if that type's ABI 2146 /// alignment is itself suitable, this will return zero. 2147 unsigned getSliceAlign(Type *Ty = 0) { 2148 unsigned NewAIAlign = NewAI.getAlignment(); 2149 if (!NewAIAlign) 2150 NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); 2151 unsigned Align = MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); 2152 return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 
0 : Align; 2153 } 2154 2155 unsigned getIndex(uint64_t Offset) { 2156 assert(VecTy && "Can only call getIndex when rewriting a vector"); 2157 uint64_t RelOffset = Offset - NewAllocaBeginOffset; 2158 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); 2159 uint32_t Index = RelOffset / ElementSize; 2160 assert(Index * ElementSize == RelOffset); 2161 return Index; 2162 } 2163 2164 void deleteIfTriviallyDead(Value *V) { 2165 Instruction *I = cast<Instruction>(V); 2166 if (isInstructionTriviallyDead(I)) 2167 Pass.DeadInsts.insert(I); 2168 } 2169 2170 Value *rewriteVectorizedLoadInst() { 2171 unsigned BeginIndex = getIndex(NewBeginOffset); 2172 unsigned EndIndex = getIndex(NewEndOffset); 2173 assert(EndIndex > BeginIndex && "Empty vector!"); 2174 2175 Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2176 "load"); 2177 return extractVector(IRB, V, BeginIndex, EndIndex, "vec"); 2178 } 2179 2180 Value *rewriteIntegerLoad(LoadInst &LI) { 2181 assert(IntTy && "We cannot insert an integer to the alloca"); 2182 assert(!LI.isVolatile()); 2183 Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2184 "load"); 2185 V = convertValue(DL, IRB, V, IntTy); 2186 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2187 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 2188 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) 2189 V = extractInteger(DL, IRB, V, cast<IntegerType>(LI.getType()), Offset, 2190 "extract"); 2191 return V; 2192 } 2193 2194 bool visitLoadInst(LoadInst &LI) { 2195 DEBUG(dbgs() << " original: " << LI << "\n"); 2196 Value *OldOp = LI.getOperand(0); 2197 assert(OldOp == OldPtr); 2198 2199 Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8) 2200 : LI.getType(); 2201 bool IsPtrAdjusted = false; 2202 Value *V; 2203 if (VecTy) { 2204 V = rewriteVectorizedLoadInst(); 2205 } else if (IntTy && LI.getType()->isIntegerTy()) { 2206 V = rewriteIntegerLoad(LI); 2207 } else if (NewBeginOffset == NewAllocaBeginOffset && 2208 canConvertValue(DL, NewAllocaTy, LI.getType())) { 2209 V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2210 LI.isVolatile(), LI.getName()); 2211 } else { 2212 Type *LTy = TargetTy->getPointerTo(); 2213 V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), 2214 getSliceAlign(TargetTy), LI.isVolatile(), 2215 LI.getName()); 2216 IsPtrAdjusted = true; 2217 } 2218 V = convertValue(DL, IRB, V, TargetTy); 2219 2220 if (IsSplit) { 2221 assert(!LI.isVolatile()); 2222 assert(LI.getType()->isIntegerTy() && 2223 "Only integer type loads and stores are split"); 2224 assert(SliceSize < DL.getTypeStoreSize(LI.getType()) && 2225 "Split load isn't smaller than original load"); 2226 assert(LI.getType()->getIntegerBitWidth() == 2227 DL.getTypeStoreSizeInBits(LI.getType()) && 2228 "Non-byte-multiple bit width"); 2229 // Move the insertion point just past the load so that we can refer to it. 2230 IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI))); 2231 // Create a placeholder value with the same type as LI to use as the 2232 // basis for the new value. This allows us to replace the uses of LI with 2233 // the computed value, and then replace the placeholder with LI, leaving 2234 // LI only used for this computation. 
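// Concretely (with made-up widths): if an i64 load was split and this slice
// supplies one i32 piece of it, we emit the narrow load, insert it into a
// placeholder of the original i64 type at the slice's offset, point all of
// LI's users at that merged value, and then swap the placeholder for LI so
// that LI survives only as an input to the insertion.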
2235 Value *Placeholder 2236 = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); 2237 V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset, 2238 "insert"); 2239 LI.replaceAllUsesWith(V); 2240 Placeholder->replaceAllUsesWith(&LI); 2241 delete Placeholder; 2242 } else { 2243 LI.replaceAllUsesWith(V); 2244 } 2245 2246 Pass.DeadInsts.insert(&LI); 2247 deleteIfTriviallyDead(OldOp); 2248 DEBUG(dbgs() << " to: " << *V << "\n"); 2249 return !LI.isVolatile() && !IsPtrAdjusted; 2250 } 2251 2252 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) { 2253 if (V->getType() != VecTy) { 2254 unsigned BeginIndex = getIndex(NewBeginOffset); 2255 unsigned EndIndex = getIndex(NewEndOffset); 2256 assert(EndIndex > BeginIndex && "Empty vector!"); 2257 unsigned NumElements = EndIndex - BeginIndex; 2258 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 2259 Type *SliceTy = 2260 (NumElements == 1) ? ElementTy 2261 : VectorType::get(ElementTy, NumElements); 2262 if (V->getType() != SliceTy) 2263 V = convertValue(DL, IRB, V, SliceTy); 2264 2265 // Mix in the existing elements. 2266 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2267 "load"); 2268 V = insertVector(IRB, Old, V, BeginIndex, "vec"); 2269 } 2270 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 2271 Pass.DeadInsts.insert(&SI); 2272 2273 (void)Store; 2274 DEBUG(dbgs() << " to: " << *Store << "\n"); 2275 return true; 2276 } 2277 2278 bool rewriteIntegerStore(Value *V, StoreInst &SI) { 2279 assert(IntTy && "We cannot extract an integer from the alloca"); 2280 assert(!SI.isVolatile()); 2281 if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { 2282 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2283 "oldload"); 2284 Old = convertValue(DL, IRB, Old, IntTy); 2285 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2286 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2287 V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, 2288 "insert"); 2289 } 2290 V = convertValue(DL, IRB, V, NewAllocaTy); 2291 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 2292 Pass.DeadInsts.insert(&SI); 2293 (void)Store; 2294 DEBUG(dbgs() << " to: " << *Store << "\n"); 2295 return true; 2296 } 2297 2298 bool visitStoreInst(StoreInst &SI) { 2299 DEBUG(dbgs() << " original: " << SI << "\n"); 2300 Value *OldOp = SI.getOperand(1); 2301 assert(OldOp == OldPtr); 2302 2303 Value *V = SI.getValueOperand(); 2304 2305 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2306 // alloca that should be re-examined after promoting this alloca. 
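// For instance (hypothetical IR), if the value being stored is the address
// of another alloca:
//   %inner = alloca i32
//   %p = getelementptr inbounds i32* %inner, i64 0
//   store i32* %p, i32** %slot        ; %slot derives from the alloca being rewritten
// stripInBoundsOffsets() walks back to %inner, which we queue so it is
// revisited once this store has been rewritten away.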
2307 if (V->getType()->isPointerTy()) 2308 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) 2309 Pass.PostPromotionWorklist.insert(AI); 2310 2311 if (SliceSize < DL.getTypeStoreSize(V->getType())) { 2312 assert(!SI.isVolatile()); 2313 assert(V->getType()->isIntegerTy() && 2314 "Only integer type loads and stores are split"); 2315 assert(V->getType()->getIntegerBitWidth() == 2316 DL.getTypeStoreSizeInBits(V->getType()) && 2317 "Non-byte-multiple bit width"); 2318 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8); 2319 V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, 2320 "extract"); 2321 } 2322 2323 if (VecTy) 2324 return rewriteVectorizedStoreInst(V, SI, OldOp); 2325 if (IntTy && V->getType()->isIntegerTy()) 2326 return rewriteIntegerStore(V, SI); 2327 2328 StoreInst *NewSI; 2329 if (NewBeginOffset == NewAllocaBeginOffset && 2330 NewEndOffset == NewAllocaEndOffset && 2331 canConvertValue(DL, V->getType(), NewAllocaTy)) { 2332 V = convertValue(DL, IRB, V, NewAllocaTy); 2333 NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), 2334 SI.isVolatile()); 2335 } else { 2336 Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo()); 2337 NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()), 2338 SI.isVolatile()); 2339 } 2340 (void)NewSI; 2341 Pass.DeadInsts.insert(&SI); 2342 deleteIfTriviallyDead(OldOp); 2343 2344 DEBUG(dbgs() << " to: " << *NewSI << "\n"); 2345 return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile(); 2346 } 2347 2348 /// \brief Compute an integer value from splatting an i8 across the given 2349 /// number of bytes. 2350 /// 2351 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't 2352 /// call this routine. 2353 /// FIXME: Heed the advice above. 2354 /// 2355 /// \param V The i8 value to splat. 2356 /// \param Size The number of bytes in the output (assuming i8 is one byte) 2357 Value *getIntegerSplat(Value *V, unsigned Size) { 2358 assert(Size > 0 && "Expected a positive number of bytes."); 2359 IntegerType *VTy = cast<IntegerType>(V->getType()); 2360 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte"); 2361 if (Size == 1) 2362 return V; 2363 2364 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); 2365 V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, "zext"), 2366 ConstantExpr::getUDiv( 2367 Constant::getAllOnesValue(SplatIntTy), 2368 ConstantExpr::getZExt( 2369 Constant::getAllOnesValue(V->getType()), 2370 SplatIntTy)), 2371 "isplat"); 2372 return V; 2373 } 2374 2375 /// \brief Compute a vector splat for a given element value. 2376 Value *getVectorSplat(Value *V, unsigned NumElements) { 2377 V = IRB.CreateVectorSplat(NumElements, V, "vsplat"); 2378 DEBUG(dbgs() << " splat: " << *V << "\n"); 2379 return V; 2380 } 2381 2382 bool visitMemSetInst(MemSetInst &II) { 2383 DEBUG(dbgs() << " original: " << II << "\n"); 2384 assert(II.getRawDest() == OldPtr); 2385 2386 // If the memset has a variable size, it cannot be split, just adjust the 2387 // pointer to the new alloca. 2388 if (!isa<Constant>(II.getLength())) { 2389 assert(!IsSplit); 2390 assert(NewBeginOffset == BeginOffset); 2391 II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType())); 2392 Type *CstTy = II.getAlignmentCst()->getType(); 2393 II.setAlignment(ConstantInt::get(CstTy, getSliceAlign())); 2394 2395 deleteIfTriviallyDead(OldPtr); 2396 return false; 2397 } 2398 2399 // Record this instruction for deletion. 
2400 Pass.DeadInsts.insert(&II);
2401
2402 Type *AllocaTy = NewAI.getAllocatedType();
2403 Type *ScalarTy = AllocaTy->getScalarType();
2404
2405 // If this doesn't map cleanly onto the alloca type, and that type isn't
2406 // a single value type, just emit a memset.
2407 if (!VecTy && !IntTy &&
2408 (BeginOffset > NewAllocaBeginOffset ||
2409 EndOffset < NewAllocaEndOffset ||
2410 !AllocaTy->isSingleValueType() ||
2411 !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
2412 DL.getTypeSizeInBits(ScalarTy)%8 != 0)) {
2413 Type *SizeTy = II.getLength()->getType();
2414 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
2415 CallInst *New = IRB.CreateMemSet(
2416 getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
2417 getSliceAlign(), II.isVolatile());
2418 (void)New;
2419 DEBUG(dbgs() << " to: " << *New << "\n");
2420 return false;
2421 }
2422
2423 // If we can represent this as a simple value, we have to build the actual
2424 // value to store, which requires expanding the byte present in memset to
2425 // a sensible representation for the alloca type. This is essentially
2426 // splatting the byte to a sufficiently wide integer, splatting it across
2427 // any desired vector width, and bitcasting to the final type.
2428 Value *V;
2429
2430 if (VecTy) {
2431 // If this is a memset of a vectorized alloca, insert it.
2432 assert(ElementTy == ScalarTy);
2433
2434 unsigned BeginIndex = getIndex(NewBeginOffset);
2435 unsigned EndIndex = getIndex(NewEndOffset);
2436 assert(EndIndex > BeginIndex && "Empty vector!");
2437 unsigned NumElements = EndIndex - BeginIndex;
2438 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2439
2440 Value *Splat =
2441 getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8);
2442 Splat = convertValue(DL, IRB, Splat, ElementTy);
2443 if (NumElements > 1)
2444 Splat = getVectorSplat(Splat, NumElements);
2445
2446 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
2447 "oldload");
2448 V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
2449 } else if (IntTy) {
2450 // If this is a memset on an alloca where we can widen stores, insert the
2451 // set integer.
2452 assert(!II.isVolatile());
2453
2454 uint64_t Size = NewEndOffset - NewBeginOffset;
2455 V = getIntegerSplat(II.getValue(), Size);
2456
2457 if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
2458 EndOffset != NewAllocaEndOffset)) {
2459 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
2460 "oldload");
2461 Old = convertValue(DL, IRB, Old, IntTy);
2462 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2463 V = insertInteger(DL, IRB, Old, V, Offset, "insert");
2464 } else {
2465 assert(V->getType() == IntTy &&
2466 "Wrong type for an alloca wide integer!");
2467 }
2468 V = convertValue(DL, IRB, V, AllocaTy);
2469 } else {
2470 // Established these invariants above.
2471 assert(NewBeginOffset == NewAllocaBeginOffset); 2472 assert(NewEndOffset == NewAllocaEndOffset); 2473 2474 V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8); 2475 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy)) 2476 V = getVectorSplat(V, AllocaVecTy->getNumElements()); 2477 2478 V = convertValue(DL, IRB, V, AllocaTy); 2479 } 2480 2481 Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), 2482 II.isVolatile()); 2483 (void)New; 2484 DEBUG(dbgs() << " to: " << *New << "\n"); 2485 return !II.isVolatile(); 2486 } 2487 2488 bool visitMemTransferInst(MemTransferInst &II) { 2489 // Rewriting of memory transfer instructions can be a bit tricky. We break 2490 // them into two categories: split intrinsics and unsplit intrinsics. 2491 2492 DEBUG(dbgs() << " original: " << II << "\n"); 2493 2494 bool IsDest = &II.getRawDestUse() == OldUse; 2495 assert((IsDest && II.getRawDest() == OldPtr) || 2496 (!IsDest && II.getRawSource() == OldPtr)); 2497 2498 unsigned SliceAlign = getSliceAlign(); 2499 2500 // For unsplit intrinsics, we simply modify the source and destination 2501 // pointers in place. This isn't just an optimization, it is a matter of 2502 // correctness. With unsplit intrinsics we may be dealing with transfers 2503 // within a single alloca before SROA ran, or with transfers that have 2504 // a variable length. We may also be dealing with memmove instead of 2505 // memcpy, and so simply updating the pointers is the necessary for us to 2506 // update both source and dest of a single call. 2507 if (!IsSplittable) { 2508 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 2509 if (IsDest) 2510 II.setDest(AdjustedPtr); 2511 else 2512 II.setSource(AdjustedPtr); 2513 2514 if (II.getAlignment() > SliceAlign) { 2515 Type *CstTy = II.getAlignmentCst()->getType(); 2516 II.setAlignment( 2517 ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign))); 2518 } 2519 2520 DEBUG(dbgs() << " to: " << II << "\n"); 2521 deleteIfTriviallyDead(OldPtr); 2522 return false; 2523 } 2524 // For split transfer intrinsics we have an incredibly useful assurance: 2525 // the source and destination do not reside within the same alloca, and at 2526 // least one of them does not escape. This means that we can replace 2527 // memmove with memcpy, and we don't need to worry about all manner of 2528 // downsides to splitting and transforming the operations. 2529 2530 // If this doesn't map cleanly onto the alloca type, and that type isn't 2531 // a single value type, just emit a memcpy. 2532 bool EmitMemCpy 2533 = !VecTy && !IntTy && (BeginOffset > NewAllocaBeginOffset || 2534 EndOffset < NewAllocaEndOffset || 2535 !NewAI.getAllocatedType()->isSingleValueType()); 2536 2537 // If we're just going to emit a memcpy, the alloca hasn't changed, and the 2538 // size hasn't been shrunk based on analysis of the viable range, this is 2539 // a no-op. 2540 if (EmitMemCpy && &OldAI == &NewAI) { 2541 // Ensure the start lines up. 2542 assert(NewBeginOffset == BeginOffset); 2543 2544 // Rewrite the size as needed. 2545 if (NewEndOffset != EndOffset) 2546 II.setLength(ConstantInt::get(II.getLength()->getType(), 2547 NewEndOffset - NewBeginOffset)); 2548 return false; 2549 } 2550 // Record this instruction for deletion. 2551 Pass.DeadInsts.insert(&II); 2552 2553 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2554 // alloca that should be re-examined after rewriting this instruction. 2555 Value *OtherPtr = IsDest ? 
II.getRawSource() : II.getRawDest(); 2556 if (AllocaInst *AI 2557 = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) { 2558 assert(AI != &OldAI && AI != &NewAI && 2559 "Splittable transfers cannot reach the same alloca on both ends."); 2560 Pass.Worklist.insert(AI); 2561 } 2562 2563 Type *OtherPtrTy = OtherPtr->getType(); 2564 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace(); 2565 2566 // Compute the relative offset for the other pointer within the transfer. 2567 unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS); 2568 APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset); 2569 unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1, 2570 OtherOffset.zextOrTrunc(64).getZExtValue()); 2571 2572 if (EmitMemCpy) { 2573 // Compute the other pointer, folding as much as possible to produce 2574 // a single, simple GEP in most cases. 2575 OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, 2576 OtherPtr->getName() + "."); 2577 2578 Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 2579 Type *SizeTy = II.getLength()->getType(); 2580 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); 2581 2582 CallInst *New = IRB.CreateMemCpy( 2583 IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size, 2584 MinAlign(SliceAlign, OtherAlign), II.isVolatile()); 2585 (void)New; 2586 DEBUG(dbgs() << " to: " << *New << "\n"); 2587 return false; 2588 } 2589 2590 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset && 2591 NewEndOffset == NewAllocaEndOffset; 2592 uint64_t Size = NewEndOffset - NewBeginOffset; 2593 unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0; 2594 unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0; 2595 unsigned NumElements = EndIndex - BeginIndex; 2596 IntegerType *SubIntTy 2597 = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; 2598 2599 // Reset the other pointer type to match the register type we're going to 2600 // use, but using the address space of the original other pointer. 
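// A hypothetical illustration: for a <4 x i32> vector alloca where this
// slice covers elements [1,3), OtherPtrTy becomes a pointer to <2 x i32> in
// OtherAS; for an integer-widened alloca with an 8-byte sub-range it becomes
// i64*; and for a whole-alloca transfer it is simply a pointer to the new
// alloca's type.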
2601 if (VecTy && !IsWholeAlloca) { 2602 if (NumElements == 1) 2603 OtherPtrTy = VecTy->getElementType(); 2604 else 2605 OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements); 2606 2607 OtherPtrTy = OtherPtrTy->getPointerTo(OtherAS); 2608 } else if (IntTy && !IsWholeAlloca) { 2609 OtherPtrTy = SubIntTy->getPointerTo(OtherAS); 2610 } else { 2611 OtherPtrTy = NewAllocaTy->getPointerTo(OtherAS); 2612 } 2613 2614 Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, 2615 OtherPtr->getName() + "."); 2616 unsigned SrcAlign = OtherAlign; 2617 Value *DstPtr = &NewAI; 2618 unsigned DstAlign = SliceAlign; 2619 if (!IsDest) { 2620 std::swap(SrcPtr, DstPtr); 2621 std::swap(SrcAlign, DstAlign); 2622 } 2623 2624 Value *Src; 2625 if (VecTy && !IsWholeAlloca && !IsDest) { 2626 Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2627 "load"); 2628 Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec"); 2629 } else if (IntTy && !IsWholeAlloca && !IsDest) { 2630 Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2631 "load"); 2632 Src = convertValue(DL, IRB, Src, IntTy); 2633 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 2634 Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); 2635 } else { 2636 Src = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), 2637 "copyload"); 2638 } 2639 2640 if (VecTy && !IsWholeAlloca && IsDest) { 2641 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2642 "oldload"); 2643 Src = insertVector(IRB, Old, Src, BeginIndex, "vec"); 2644 } else if (IntTy && !IsWholeAlloca && IsDest) { 2645 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2646 "oldload"); 2647 Old = convertValue(DL, IRB, Old, IntTy); 2648 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 2649 Src = insertInteger(DL, IRB, Old, Src, Offset, "insert"); 2650 Src = convertValue(DL, IRB, Src, NewAllocaTy); 2651 } 2652 2653 StoreInst *Store = cast<StoreInst>( 2654 IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile())); 2655 (void)Store; 2656 DEBUG(dbgs() << " to: " << *Store << "\n"); 2657 return !II.isVolatile(); 2658 } 2659 2660 bool visitIntrinsicInst(IntrinsicInst &II) { 2661 assert(II.getIntrinsicID() == Intrinsic::lifetime_start || 2662 II.getIntrinsicID() == Intrinsic::lifetime_end); 2663 DEBUG(dbgs() << " original: " << II << "\n"); 2664 assert(II.getArgOperand(1) == OldPtr); 2665 2666 // Record this instruction for deletion. 2667 Pass.DeadInsts.insert(&II); 2668 2669 ConstantInt *Size 2670 = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), 2671 NewEndOffset - NewBeginOffset); 2672 Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 2673 Value *New; 2674 if (II.getIntrinsicID() == Intrinsic::lifetime_start) 2675 New = IRB.CreateLifetimeStart(Ptr, Size); 2676 else 2677 New = IRB.CreateLifetimeEnd(Ptr, Size); 2678 2679 (void)New; 2680 DEBUG(dbgs() << " to: " << *New << "\n"); 2681 return true; 2682 } 2683 2684 bool visitPHINode(PHINode &PN) { 2685 DEBUG(dbgs() << " original: " << PN << "\n"); 2686 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable"); 2687 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable"); 2688 2689 // We would like to compute a new pointer in only one place, but have it be 2690 // as local as possible to the PHI. To do that, we re-use the location of 2691 // the old pointer, which necessarily must be in the right position to 2692 // dominate the PHI. 
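// Sketch of the intent (block and value names are hypothetical): if %old.ptr
// is defined in %pred and feeds the PHI along that edge, emitting %new.ptr
// at %old.ptr's position preserves the required dominance:
//   pred:
//     %old.ptr = bitcast ...
//     %new.ptr = getelementptr ...    ; inserted here by PtrBuilder
//     br label %merge
//   merge:
//     %phi = phi i32* [ %new.ptr, %pred ], ...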
2693 IRBuilderTy PtrBuilder(IRB); 2694 PtrBuilder.SetInsertPoint(OldPtr); 2695 PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc()); 2696 2697 Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType()); 2698 // Replace the operands which were using the old pointer. 2699 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); 2700 2701 DEBUG(dbgs() << " to: " << PN << "\n"); 2702 deleteIfTriviallyDead(OldPtr); 2703 2704 // PHIs can't be promoted on their own, but often can be speculated. We 2705 // check the speculation outside of the rewriter so that we see the 2706 // fully-rewritten alloca. 2707 PHIUsers.insert(&PN); 2708 return true; 2709 } 2710 2711 bool visitSelectInst(SelectInst &SI) { 2712 DEBUG(dbgs() << " original: " << SI << "\n"); 2713 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) && 2714 "Pointer isn't an operand!"); 2715 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable"); 2716 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable"); 2717 2718 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 2719 // Replace the operands which were using the old pointer. 2720 if (SI.getOperand(1) == OldPtr) 2721 SI.setOperand(1, NewPtr); 2722 if (SI.getOperand(2) == OldPtr) 2723 SI.setOperand(2, NewPtr); 2724 2725 DEBUG(dbgs() << " to: " << SI << "\n"); 2726 deleteIfTriviallyDead(OldPtr); 2727 2728 // Selects can't be promoted on their own, but often can be speculated. We 2729 // check the speculation outside of the rewriter so that we see the 2730 // fully-rewritten alloca. 2731 SelectUsers.insert(&SI); 2732 return true; 2733 } 2734 2735}; 2736} 2737 2738namespace { 2739/// \brief Visitor to rewrite aggregate loads and stores as scalar. 2740/// 2741/// This pass aggressively rewrites all aggregate loads and stores on 2742/// a particular pointer (or any pointer derived from it which we can identify) 2743/// with scalar loads and stores. 2744class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { 2745 // Befriend the base class so it can delegate to private visit methods. 2746 friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; 2747 2748 const DataLayout &DL; 2749 2750 /// Queue of pointer uses to analyze and potentially rewrite. 2751 SmallVector<Use *, 8> Queue; 2752 2753 /// Set to prevent us from cycling with phi nodes and loops. 2754 SmallPtrSet<User *, 8> Visited; 2755 2756 /// The current pointer use being rewritten. This is used to dig up the used 2757 /// value (as opposed to the user). 2758 Use *U; 2759 2760public: 2761 AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {} 2762 2763 /// Rewrite loads and stores through a pointer and all pointers derived from 2764 /// it. 2765 bool rewrite(Instruction &I) { 2766 DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); 2767 enqueueUsers(I); 2768 bool Changed = false; 2769 while (!Queue.empty()) { 2770 U = Queue.pop_back_val(); 2771 Changed |= visit(cast<Instruction>(U->getUser())); 2772 } 2773 return Changed; 2774 } 2775 2776private: 2777 /// Enqueue all the users of the given instruction for further processing. 2778 /// This uses a set to de-duplicate users. 2779 void enqueueUsers(Instruction &I) { 2780 for (Use &U : I.uses()) 2781 if (Visited.insert(U.getUser())) 2782 Queue.push_back(&U); 2783 } 2784 2785 // Conservative default is to not rewrite anything. 2786 bool visitInstruction(Instruction &I) { return false; } 2787 2788 /// \brief Generic recursive split emission class. 
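///
/// As an illustration, splitting an operation on { i32, [2 x float] } visits
/// the leaves i32, float and float with index lists {0}, {1,0} and {1,1};
/// GEPIndices mirrors those lists as i32 constants behind the initial zero
/// index so the matching element pointer can be formed at each leaf.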
2789 template <typename Derived> 2790 class OpSplitter { 2791 protected: 2792 /// The builder used to form new instructions. 2793 IRBuilderTy IRB; 2794 /// The indices which to be used with insert- or extractvalue to select the 2795 /// appropriate value within the aggregate. 2796 SmallVector<unsigned, 4> Indices; 2797 /// The indices to a GEP instruction which will move Ptr to the correct slot 2798 /// within the aggregate. 2799 SmallVector<Value *, 4> GEPIndices; 2800 /// The base pointer of the original op, used as a base for GEPing the 2801 /// split operations. 2802 Value *Ptr; 2803 2804 /// Initialize the splitter with an insertion point, Ptr and start with a 2805 /// single zero GEP index. 2806 OpSplitter(Instruction *InsertionPoint, Value *Ptr) 2807 : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} 2808 2809 public: 2810 /// \brief Generic recursive split emission routine. 2811 /// 2812 /// This method recursively splits an aggregate op (load or store) into 2813 /// scalar or vector ops. It splits recursively until it hits a single value 2814 /// and emits that single value operation via the template argument. 2815 /// 2816 /// The logic of this routine relies on GEPs and insertvalue and 2817 /// extractvalue all operating with the same fundamental index list, merely 2818 /// formatted differently (GEPs need actual values). 2819 /// 2820 /// \param Ty The type being split recursively into smaller ops. 2821 /// \param Agg The aggregate value being built up or stored, depending on 2822 /// whether this is splitting a load or a store respectively. 2823 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) { 2824 if (Ty->isSingleValueType()) 2825 return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name); 2826 2827 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 2828 unsigned OldSize = Indices.size(); 2829 (void)OldSize; 2830 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size; 2831 ++Idx) { 2832 assert(Indices.size() == OldSize && "Did not return to the old size"); 2833 Indices.push_back(Idx); 2834 GEPIndices.push_back(IRB.getInt32(Idx)); 2835 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx)); 2836 GEPIndices.pop_back(); 2837 Indices.pop_back(); 2838 } 2839 return; 2840 } 2841 2842 if (StructType *STy = dyn_cast<StructType>(Ty)) { 2843 unsigned OldSize = Indices.size(); 2844 (void)OldSize; 2845 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size; 2846 ++Idx) { 2847 assert(Indices.size() == OldSize && "Did not return to the old size"); 2848 Indices.push_back(Idx); 2849 GEPIndices.push_back(IRB.getInt32(Idx)); 2850 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx)); 2851 GEPIndices.pop_back(); 2852 Indices.pop_back(); 2853 } 2854 return; 2855 } 2856 2857 llvm_unreachable("Only arrays and structs are aggregate loadable types"); 2858 } 2859 }; 2860 2861 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { 2862 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr) 2863 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} 2864 2865 /// Emit a leaf load of a single value. This is called at the leaves of the 2866 /// recursive emission to actually load values. 2867 void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { 2868 assert(Ty->isSingleValueType()); 2869 // Load the single value and insert it using the indices. 
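// E.g. (names here are illustrative) for Indices == {1, 0} on a pointer to
// { i32, [2 x float] } this emits roughly:
//   %x.gep    = getelementptr inbounds %T* %ptr, i32 0, i32 1, i32 0
//   %x.load   = load float* %x.gep
//   %x.insert = insertvalue { i32, [2 x float] } %agg, float %x.load, 1, 0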
2870 Value *GEP = IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"); 2871 Value *Load = IRB.CreateLoad(GEP, Name + ".load"); 2872 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); 2873 DEBUG(dbgs() << " to: " << *Load << "\n"); 2874 } 2875 }; 2876 2877 bool visitLoadInst(LoadInst &LI) { 2878 assert(LI.getPointerOperand() == *U); 2879 if (!LI.isSimple() || LI.getType()->isSingleValueType()) 2880 return false; 2881 2882 // We have an aggregate being loaded, split it apart. 2883 DEBUG(dbgs() << " original: " << LI << "\n"); 2884 LoadOpSplitter Splitter(&LI, *U); 2885 Value *V = UndefValue::get(LI.getType()); 2886 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); 2887 LI.replaceAllUsesWith(V); 2888 LI.eraseFromParent(); 2889 return true; 2890 } 2891 2892 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { 2893 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) 2894 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} 2895 2896 /// Emit a leaf store of a single value. This is called at the leaves of the 2897 /// recursive emission to actually produce stores. 2898 void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { 2899 assert(Ty->isSingleValueType()); 2900 // Extract the single value and store it using the indices. 2901 Value *Store = IRB.CreateStore( 2902 IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), 2903 IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); 2904 (void)Store; 2905 DEBUG(dbgs() << " to: " << *Store << "\n"); 2906 } 2907 }; 2908 2909 bool visitStoreInst(StoreInst &SI) { 2910 if (!SI.isSimple() || SI.getPointerOperand() != *U) 2911 return false; 2912 Value *V = SI.getValueOperand(); 2913 if (V->getType()->isSingleValueType()) 2914 return false; 2915 2916 // We have an aggregate being stored, split it apart. 2917 DEBUG(dbgs() << " original: " << SI << "\n"); 2918 StoreOpSplitter Splitter(&SI, *U); 2919 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); 2920 SI.eraseFromParent(); 2921 return true; 2922 } 2923 2924 bool visitBitCastInst(BitCastInst &BC) { 2925 enqueueUsers(BC); 2926 return false; 2927 } 2928 2929 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { 2930 enqueueUsers(GEPI); 2931 return false; 2932 } 2933 2934 bool visitPHINode(PHINode &PN) { 2935 enqueueUsers(PN); 2936 return false; 2937 } 2938 2939 bool visitSelectInst(SelectInst &SI) { 2940 enqueueUsers(SI); 2941 return false; 2942 } 2943}; 2944} 2945 2946/// \brief Strip aggregate type wrapping. 2947/// 2948/// This removes no-op aggregate types wrapping an underlying type. It will 2949/// strip as many layers of types as it can without changing either the type 2950/// size or the allocated size. 
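///
/// For example, { { float } } and [1 x i32] are stripped down to float and
/// i32 respectively, while { i32, i32 } is returned unchanged because its
/// first element does not span the whole allocation.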
2951static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { 2952 if (Ty->isSingleValueType()) 2953 return Ty; 2954 2955 uint64_t AllocSize = DL.getTypeAllocSize(Ty); 2956 uint64_t TypeSize = DL.getTypeSizeInBits(Ty); 2957 2958 Type *InnerTy; 2959 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 2960 InnerTy = ArrTy->getElementType(); 2961 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 2962 const StructLayout *SL = DL.getStructLayout(STy); 2963 unsigned Index = SL->getElementContainingOffset(0); 2964 InnerTy = STy->getElementType(Index); 2965 } else { 2966 return Ty; 2967 } 2968 2969 if (AllocSize > DL.getTypeAllocSize(InnerTy) || 2970 TypeSize > DL.getTypeSizeInBits(InnerTy)) 2971 return Ty; 2972 2973 return stripAggregateTypeWrapping(DL, InnerTy); 2974} 2975 2976/// \brief Try to find a partition of the aggregate type passed in for a given 2977/// offset and size. 2978/// 2979/// This recurses through the aggregate type and tries to compute a subtype 2980/// based on the offset and size. When the offset and size span a sub-section 2981/// of an array, it will even compute a new array type for that sub-section, 2982/// and the same for structs. 2983/// 2984/// Note that this routine is very strict and tries to find a partition of the 2985/// type which produces the *exact* right offset and size. It is not forgiving 2986/// when the size or offset cause either end of type-based partition to be off. 2987/// Also, this is a best-effort routine. It is reasonable to give up and not 2988/// return a type if necessary. 2989static Type *getTypePartition(const DataLayout &DL, Type *Ty, 2990 uint64_t Offset, uint64_t Size) { 2991 if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size) 2992 return stripAggregateTypeWrapping(DL, Ty); 2993 if (Offset > DL.getTypeAllocSize(Ty) || 2994 (DL.getTypeAllocSize(Ty) - Offset) < Size) 2995 return 0; 2996 2997 if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { 2998 // We can't partition pointers... 2999 if (SeqTy->isPointerTy()) 3000 return 0; 3001 3002 Type *ElementTy = SeqTy->getElementType(); 3003 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); 3004 uint64_t NumSkippedElements = Offset / ElementSize; 3005 if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) { 3006 if (NumSkippedElements >= ArrTy->getNumElements()) 3007 return 0; 3008 } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) { 3009 if (NumSkippedElements >= VecTy->getNumElements()) 3010 return 0; 3011 } 3012 Offset -= NumSkippedElements * ElementSize; 3013 3014 // First check if we need to recurse. 3015 if (Offset > 0 || Size < ElementSize) { 3016 // Bail if the partition ends in a different array element. 3017 if ((Offset + Size) > ElementSize) 3018 return 0; 3019 // Recurse through the element type trying to peel off offset bytes. 
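// E.g. (made-up numbers) for [4 x { i32, i32 }] with Offset 12 and Size 4:
// ElementSize is 8, one element is skipped, Offset becomes 4, and the
// recursion into { i32, i32 } yields its second i32.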
3020 return getTypePartition(DL, ElementTy, Offset, Size); 3021 } 3022 assert(Offset == 0); 3023 3024 if (Size == ElementSize) 3025 return stripAggregateTypeWrapping(DL, ElementTy); 3026 assert(Size > ElementSize); 3027 uint64_t NumElements = Size / ElementSize; 3028 if (NumElements * ElementSize != Size) 3029 return 0; 3030 return ArrayType::get(ElementTy, NumElements); 3031 } 3032 3033 StructType *STy = dyn_cast<StructType>(Ty); 3034 if (!STy) 3035 return 0; 3036 3037 const StructLayout *SL = DL.getStructLayout(STy); 3038 if (Offset >= SL->getSizeInBytes()) 3039 return 0; 3040 uint64_t EndOffset = Offset + Size; 3041 if (EndOffset > SL->getSizeInBytes()) 3042 return 0; 3043 3044 unsigned Index = SL->getElementContainingOffset(Offset); 3045 Offset -= SL->getElementOffset(Index); 3046 3047 Type *ElementTy = STy->getElementType(Index); 3048 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); 3049 if (Offset >= ElementSize) 3050 return 0; // The offset points into alignment padding. 3051 3052 // See if any partition must be contained by the element. 3053 if (Offset > 0 || Size < ElementSize) { 3054 if ((Offset + Size) > ElementSize) 3055 return 0; 3056 return getTypePartition(DL, ElementTy, Offset, Size); 3057 } 3058 assert(Offset == 0); 3059 3060 if (Size == ElementSize) 3061 return stripAggregateTypeWrapping(DL, ElementTy); 3062 3063 StructType::element_iterator EI = STy->element_begin() + Index, 3064 EE = STy->element_end(); 3065 if (EndOffset < SL->getSizeInBytes()) { 3066 unsigned EndIndex = SL->getElementContainingOffset(EndOffset); 3067 if (Index == EndIndex) 3068 return 0; // Within a single element and its padding. 3069 3070 // Don't try to form "natural" types if the elements don't line up with the 3071 // expected size. 3072 // FIXME: We could potentially recurse down through the last element in the 3073 // sub-struct to find a natural end point. 3074 if (SL->getElementOffset(EndIndex) != EndOffset) 3075 return 0; 3076 3077 assert(Index < EndIndex); 3078 EE = STy->element_begin() + EndIndex; 3079 } 3080 3081 // Try to build up a sub-structure. 3082 StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), 3083 STy->isPacked()); 3084 const StructLayout *SubSL = DL.getStructLayout(SubTy); 3085 if (Size != SubSL->getSizeInBytes()) 3086 return 0; // The sub-struct doesn't have quite the size needed. 3087 3088 return SubTy; 3089} 3090 3091/// \brief Rewrite an alloca partition's users. 3092/// 3093/// This routine drives both of the rewriting goals of the SROA pass. It tries 3094/// to rewrite uses of an alloca partition to be conducive for SSA value 3095/// promotion. If the partition needs a new, more refined alloca, this will 3096/// build that new alloca, preserving as much type information as possible, and 3097/// rewrite the uses of the old alloca to point at the new one and have the 3098/// appropriate new offsets. It also evaluates how successful the rewrite was 3099/// at enabling promotion and if it was successful queues the alloca to be 3100/// promoted. 3101bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, 3102 AllocaSlices::iterator B, AllocaSlices::iterator E, 3103 int64_t BeginOffset, int64_t EndOffset, 3104 ArrayRef<AllocaSlices::iterator> SplitUses) { 3105 assert(BeginOffset < EndOffset); 3106 uint64_t SliceSize = EndOffset - BeginOffset; 3107 3108 // Try to compute a friendly type for this partition of the alloca. 
This 3109 // won't always succeed, in which case we fall back to a legal integer type 3110 // or an i8 array of an appropriate size. 3111 Type *SliceTy = 0; 3112 if (Type *CommonUseTy = findCommonType(B, E, EndOffset)) 3113 if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize) 3114 SliceTy = CommonUseTy; 3115 if (!SliceTy) 3116 if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(), 3117 BeginOffset, SliceSize)) 3118 SliceTy = TypePartitionTy; 3119 if ((!SliceTy || (SliceTy->isArrayTy() && 3120 SliceTy->getArrayElementType()->isIntegerTy())) && 3121 DL->isLegalInteger(SliceSize * 8)) 3122 SliceTy = Type::getIntNTy(*C, SliceSize * 8); 3123 if (!SliceTy) 3124 SliceTy = ArrayType::get(Type::getInt8Ty(*C), SliceSize); 3125 assert(DL->getTypeAllocSize(SliceTy) >= SliceSize); 3126 3127 bool IsVectorPromotable = isVectorPromotionViable( 3128 *DL, SliceTy, S, BeginOffset, EndOffset, B, E, SplitUses); 3129 3130 bool IsIntegerPromotable = 3131 !IsVectorPromotable && 3132 isIntegerWideningViable(*DL, SliceTy, BeginOffset, S, B, E, SplitUses); 3133 3134 // Check for the case where we're going to rewrite to a new alloca of the 3135 // exact same type as the original, and with the same access offsets. In that 3136 // case, re-use the existing alloca, but still run through the rewriter to 3137 // perform phi and select speculation. 3138 AllocaInst *NewAI; 3139 if (SliceTy == AI.getAllocatedType()) { 3140 assert(BeginOffset == 0 && 3141 "Non-zero begin offset but same alloca type"); 3142 NewAI = &AI; 3143 // FIXME: We should be able to bail at this point with "nothing changed". 3144 // FIXME: We might want to defer PHI speculation until after here. 3145 } else { 3146 unsigned Alignment = AI.getAlignment(); 3147 if (!Alignment) { 3148 // The minimum alignment which users can rely on when the explicit 3149 // alignment is omitted or zero is that required by the ABI for this 3150 // type. 3151 Alignment = DL->getABITypeAlignment(AI.getAllocatedType()); 3152 } 3153 Alignment = MinAlign(Alignment, BeginOffset); 3154 // If we will get at least this much alignment from the type alone, leave 3155 // the alloca's alignment unconstrained. 3156 if (Alignment <= DL->getABITypeAlignment(SliceTy)) 3157 Alignment = 0; 3158 NewAI = new AllocaInst(SliceTy, 0, Alignment, 3159 AI.getName() + ".sroa." + Twine(B - S.begin()), &AI); 3160 ++NumNewAllocas; 3161 } 3162 3163 DEBUG(dbgs() << "Rewriting alloca partition " 3164 << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI 3165 << "\n"); 3166 3167 // Track the high watermark on the worklist as it is only relevant for 3168 // promoted allocas. We will reset it to this point if the alloca is not in 3169 // fact scheduled for promotion. 
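// (The slice rewriting below may add entries to PostPromotionWorklist; if this
// partition ends up not being promoted, everything past PPWOldSize is popped
// back off at the end of this function.)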
3170 unsigned PPWOldSize = PostPromotionWorklist.size(); 3171 unsigned NumUses = 0; 3172 SmallPtrSet<PHINode *, 8> PHIUsers; 3173 SmallPtrSet<SelectInst *, 8> SelectUsers; 3174 3175 AllocaSliceRewriter Rewriter(*DL, S, *this, AI, *NewAI, BeginOffset, 3176 EndOffset, IsVectorPromotable, 3177 IsIntegerPromotable, PHIUsers, SelectUsers); 3178 bool Promotable = true; 3179 for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), 3180 SUE = SplitUses.end(); 3181 SUI != SUE; ++SUI) { 3182 DEBUG(dbgs() << " rewriting split "); 3183 DEBUG(S.printSlice(dbgs(), *SUI, "")); 3184 Promotable &= Rewriter.visit(*SUI); 3185 ++NumUses; 3186 } 3187 for (AllocaSlices::iterator I = B; I != E; ++I) { 3188 DEBUG(dbgs() << " rewriting "); 3189 DEBUG(S.printSlice(dbgs(), I, "")); 3190 Promotable &= Rewriter.visit(I); 3191 ++NumUses; 3192 } 3193 3194 NumAllocaPartitionUses += NumUses; 3195 MaxUsesPerAllocaPartition = 3196 std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition); 3197 3198 // Now that we've processed all the slices in the new partition, check if any 3199 // PHIs or Selects would block promotion. 3200 for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(), 3201 E = PHIUsers.end(); 3202 I != E; ++I) 3203 if (!isSafePHIToSpeculate(**I, DL)) { 3204 Promotable = false; 3205 PHIUsers.clear(); 3206 SelectUsers.clear(); 3207 break; 3208 } 3209 for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(), 3210 E = SelectUsers.end(); 3211 I != E; ++I) 3212 if (!isSafeSelectToSpeculate(**I, DL)) { 3213 Promotable = false; 3214 PHIUsers.clear(); 3215 SelectUsers.clear(); 3216 break; 3217 } 3218 3219 if (Promotable) { 3220 if (PHIUsers.empty() && SelectUsers.empty()) { 3221 // Promote the alloca. 3222 PromotableAllocas.push_back(NewAI); 3223 } else { 3224 // If we have either PHIs or Selects to speculate, add them to those 3225 // worklists and re-queue the new alloca so that we promote it on the 3226 // next iteration. 3227 for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(), 3228 E = PHIUsers.end(); 3229 I != E; ++I) 3230 SpeculatablePHIs.insert(*I); 3231 for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(), 3232 E = SelectUsers.end(); 3233 I != E; ++I) 3234 SpeculatableSelects.insert(*I); 3235 Worklist.insert(NewAI); 3236 } 3237 } else { 3238 // If we can't promote the alloca, iterate on it to check for new 3239 // refinements exposed by splitting the current alloca. Don't iterate on an 3240 // alloca which didn't actually change and didn't get promoted. 3241 if (NewAI != &AI) 3242 Worklist.insert(NewAI); 3243 3244 // Drop any post-promotion work items if promotion didn't happen. 3245 while (PostPromotionWorklist.size() > PPWOldSize) 3246 PostPromotionWorklist.pop_back(); 3247 } 3248 3249 return true; 3250} 3251 3252 static void 3253removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses, 3254 uint64_t &MaxSplitUseEndOffset, uint64_t Offset) { 3255 if (Offset >= MaxSplitUseEndOffset) { 3256 SplitUses.clear(); 3257 MaxSplitUseEndOffset = 0; 3258 return; 3259 } 3260 3261 size_t SplitUsesOldSize = SplitUses.size(); 3262 SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(), 3263 [Offset](const AllocaSlices::iterator &I) { 3264 return I->endOffset() <= Offset; 3265 }), 3266 SplitUses.end()); 3267 if (SplitUsesOldSize == SplitUses.size()) 3268 return; 3269 3270 // Recompute the max. While this is linear, so is remove_if.
3271 MaxSplitUseEndOffset = 0; 3272 for (SmallVectorImpl<AllocaSlices::iterator>::iterator 3273 SUI = SplitUses.begin(), 3274 SUE = SplitUses.end(); 3275 SUI != SUE; ++SUI) 3276 MaxSplitUseEndOffset = std::max((*SUI)->endOffset(), MaxSplitUseEndOffset); 3277} 3278 3279/// \brief Walks the slices of an alloca and forms partitions based on them, 3280/// rewriting each of their uses. 3281bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) { 3282 if (S.begin() == S.end()) 3283 return false; 3284 3285 unsigned NumPartitions = 0; 3286 bool Changed = false; 3287 SmallVector<AllocaSlices::iterator, 4> SplitUses; 3288 uint64_t MaxSplitUseEndOffset = 0; 3289 3290 uint64_t BeginOffset = S.begin()->beginOffset(); 3291 3292 for (AllocaSlices::iterator SI = S.begin(), SJ = std::next(SI), SE = S.end(); 3293 SI != SE; SI = SJ) { 3294 uint64_t MaxEndOffset = SI->endOffset(); 3295 3296 if (!SI->isSplittable()) { 3297 // When we're forming an unsplittable region, it must always start at the 3298 // first slice and will extend through its end. 3299 assert(BeginOffset == SI->beginOffset()); 3300 3301 // Form a partition including all of the overlapping slices with this 3302 // unsplittable slice. 3303 while (SJ != SE && SJ->beginOffset() < MaxEndOffset) { 3304 if (!SJ->isSplittable()) 3305 MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset()); 3306 ++SJ; 3307 } 3308 } else { 3309 assert(SI->isSplittable()); // Established above. 3310 3311 // Collect all of the overlapping splittable slices. 3312 while (SJ != SE && SJ->beginOffset() < MaxEndOffset && 3313 SJ->isSplittable()) { 3314 MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset()); 3315 ++SJ; 3316 } 3317 3318 // Back up MaxEndOffset and SJ if we ended the span early when 3319 // encountering an unsplittable slice. 3320 if (SJ != SE && SJ->beginOffset() < MaxEndOffset) { 3321 assert(!SJ->isSplittable()); 3322 MaxEndOffset = SJ->beginOffset(); 3323 } 3324 } 3325 3326 // Check if we have managed to move the end offset forward yet. If so, 3327 // we'll have to rewrite uses and erase old split uses. 3328 if (BeginOffset < MaxEndOffset) { 3329 // Rewrite a sequence of overlapping slices. 3330 Changed |= 3331 rewritePartition(AI, S, SI, SJ, BeginOffset, MaxEndOffset, SplitUses); 3332 ++NumPartitions; 3333 3334 removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, MaxEndOffset); 3335 } 3336 3337 // Accumulate all the splittable slices from the [SI,SJ) region which 3338 // overlap going forward. 3339 for (AllocaSlices::iterator SK = SI; SK != SJ; ++SK) 3340 if (SK->isSplittable() && SK->endOffset() > MaxEndOffset) { 3341 SplitUses.push_back(SK); 3342 MaxSplitUseEndOffset = std::max(SK->endOffset(), MaxSplitUseEndOffset); 3343 } 3344 3345 // If we're already at the end and we have no split uses, we're done. 3346 if (SJ == SE && SplitUses.empty()) 3347 break; 3348 3349 // If we have no split uses or no gap in offsets, we're ready to move to 3350 // the next slice. 3351 if (SplitUses.empty() || (SJ != SE && MaxEndOffset == SJ->beginOffset())) { 3352 BeginOffset = SJ->beginOffset(); 3353 continue; 3354 } 3355 3356 // Even if we have split slices, if the next slice is splittable and the 3357 // split slices reach it, we can simply set up the beginning offset of the 3358 // next iteration to bridge between them. 3359 if (SJ != SE && SJ->isSplittable() && 3360 MaxSplitUseEndOffset > SJ->beginOffset()) { 3361 BeginOffset = MaxEndOffset; 3362 continue; 3363 } 3364 3365 // Otherwise, we have a tail of split slices. Rewrite them with an empty 3366 // range of slices.
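// For example (illustrative): a splittable memset covering [0,16) overlapping
// an unsplittable i32 load at [0,4) first yields a [0,4) partition; the
// memset's remaining tail, [4,16), is rewritten here as a partition that
// contains no slices of its own, only the outstanding split use.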
3367 uint64_t PostSplitEndOffset = 3368 SJ == SE ? MaxSplitUseEndOffset : SJ->beginOffset(); 3369 3370 Changed |= rewritePartition(AI, S, SJ, SJ, MaxEndOffset, PostSplitEndOffset, 3371 SplitUses); 3372 ++NumPartitions; 3373 3374 if (SJ == SE) 3375 break; // Skip the rest, we don't need to do any cleanup. 3376 3377 removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, 3378 PostSplitEndOffset); 3379 3380 // Now just reset the begin offset for the next iteration. 3381 BeginOffset = SJ->beginOffset(); 3382 } 3383 3384 NumAllocaPartitions += NumPartitions; 3385 MaxPartitionsPerAlloca = 3386 std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca); 3387 3388 return Changed; 3389} 3390 3391/// \brief Clobber a use with undef, deleting the used value if it becomes dead. 3392void SROA::clobberUse(Use &U) { 3393 Value *OldV = U; 3394 // Replace the use with an undef value. 3395 U = UndefValue::get(OldV->getType()); 3396 3397 // Check for this making an instruction dead. We have to garbage collect 3398 // all the dead instructions to ensure the uses of any alloca end up being 3399 // minimal. 3400 if (Instruction *OldI = dyn_cast<Instruction>(OldV)) 3401 if (isInstructionTriviallyDead(OldI)) { 3402 DeadInsts.insert(OldI); 3403 } 3404} 3405 3406/// \brief Analyze an alloca for SROA. 3407/// 3408/// This analyzes the alloca to ensure we can reason about it, builds 3409/// the slices of the alloca, and then hands it off to be split and 3410/// rewritten as needed. 3411bool SROA::runOnAlloca(AllocaInst &AI) { 3412 DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); 3413 ++NumAllocasAnalyzed; 3414 3415 // Special case dead allocas, as they're trivial. 3416 if (AI.use_empty()) { 3417 AI.eraseFromParent(); 3418 return true; 3419 } 3420 3421 // Skip alloca forms that this analysis can't handle. 3422 if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || 3423 DL->getTypeAllocSize(AI.getAllocatedType()) == 0) 3424 return false; 3425 3426 bool Changed = false; 3427 3428 // First, split any FCA loads and stores touching this alloca to promote 3429 // better splitting and promotion opportunities. 3430 AggLoadStoreRewriter AggRewriter(*DL); 3431 Changed |= AggRewriter.rewrite(AI); 3432 3433 // Build the slices using a recursive instruction-visiting builder. 3434 AllocaSlices S(*DL, AI); 3435 DEBUG(S.print(dbgs())); 3436 if (S.isEscaped()) 3437 return Changed; 3438 3439 // Delete all the dead users of this alloca before splitting and rewriting it. 3440 for (AllocaSlices::dead_user_iterator DI = S.dead_user_begin(), 3441 DE = S.dead_user_end(); 3442 DI != DE; ++DI) { 3443 // Free up everything used by this instruction. 3444 for (Use &DeadOp : (*DI)->operands()) 3445 clobberUse(DeadOp); 3446 3447 // Now replace the uses of this instruction. 3448 (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType())); 3449 3450 // And mark it for deletion. 3451 DeadInsts.insert(*DI); 3452 Changed = true; 3453 } 3454 for (AllocaSlices::dead_op_iterator DO = S.dead_op_begin(), 3455 DE = S.dead_op_end(); 3456 DO != DE; ++DO) { 3457 clobberUse(**DO); 3458 Changed = true; 3459 } 3460 3461 // No slices to split. Leave the dead alloca for a later pass to clean up. 
3462 if (S.begin() == S.end()) 3463 return Changed; 3464 3465 Changed |= splitAlloca(AI, S); 3466 3467 DEBUG(dbgs() << " Speculating PHIs\n"); 3468 while (!SpeculatablePHIs.empty()) 3469 speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val()); 3470 3471 DEBUG(dbgs() << " Speculating Selects\n"); 3472 while (!SpeculatableSelects.empty()) 3473 speculateSelectInstLoads(*SpeculatableSelects.pop_back_val()); 3474 3475 return Changed; 3476} 3477 3478/// \brief Delete the dead instructions accumulated in this run. 3479/// 3480/// Recursively deletes the dead instructions we've accumulated. This is done 3481/// at the very end to maximize locality of the recursive delete and to 3482/// minimize the problems of invalidated instruction pointers as such pointers 3483/// are used heavily in the intermediate stages of the algorithm. 3484/// 3485/// We also record the alloca instructions deleted here so that they aren't 3486/// subsequently handed to mem2reg to promote. 3487void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { 3488 while (!DeadInsts.empty()) { 3489 Instruction *I = DeadInsts.pop_back_val(); 3490 DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); 3491 3492 I->replaceAllUsesWith(UndefValue::get(I->getType())); 3493 3494 for (Use &Operand : I->operands()) 3495 if (Instruction *U = dyn_cast<Instruction>(Operand)) { 3496 // Zero out the operand and see if it becomes trivially dead. 3497 Operand = 0; 3498 if (isInstructionTriviallyDead(U)) 3499 DeadInsts.insert(U); 3500 } 3501 3502 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) 3503 DeletedAllocas.insert(AI); 3504 3505 ++NumDeleted; 3506 I->eraseFromParent(); 3507 } 3508} 3509 3510static void enqueueUsersInWorklist(Instruction &I, 3511 SmallVectorImpl<Instruction *> &Worklist, 3512 SmallPtrSet<Instruction *, 8> &Visited) { 3513 for (User *U : I.users()) 3514 if (Visited.insert(cast<Instruction>(U))) 3515 Worklist.push_back(cast<Instruction>(U)); 3516} 3517 3518/// \brief Promote the allocas, using the best available technique. 3519/// 3520/// This attempts to promote whatever allocas have been identified as viable in 3521/// the PromotableAllocas list. If that list is empty, there is nothing to do. 3522/// If there is a domtree available, we attempt to promote using the full power 3523/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is 3524/// based on the SSAUpdater utilities. This function returns whether any 3525/// promotion occurred. 3526bool SROA::promoteAllocas(Function &F) { 3527 if (PromotableAllocas.empty()) 3528 return false; 3529 3530 NumPromoted += PromotableAllocas.size(); 3531 3532 if (DT && !ForceSSAUpdater) { 3533 DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); 3534 PromoteMemToReg(PromotableAllocas, *DT); 3535 PromotableAllocas.clear(); 3536 return true; 3537 } 3538 3539 DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); 3540 SSAUpdater SSA; 3541 DIBuilder DIB(*F.getParent()); 3542 SmallVector<Instruction *, 64> Insts; 3543 3544 // We need a worklist to walk the uses of each alloca. 
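// The Visited set keeps each user from being enqueued more than once, and
// DeadInsts collects the no-op bitcasts and GEPs stripped along the way so
// they can be erased after the promoter has run.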
3545 SmallVector<Instruction *, 8> Worklist; 3546 SmallPtrSet<Instruction *, 8> Visited; 3547 SmallVector<Instruction *, 32> DeadInsts; 3548 3549 for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { 3550 AllocaInst *AI = PromotableAllocas[Idx]; 3551 Insts.clear(); 3552 Worklist.clear(); 3553 Visited.clear(); 3554 3555 enqueueUsersInWorklist(*AI, Worklist, Visited); 3556 3557 while (!Worklist.empty()) { 3558 Instruction *I = Worklist.pop_back_val(); 3559 3560 // FIXME: Currently the SSAUpdater infrastructure doesn't reason about 3561 // lifetime intrinsics and so we strip them (and the bitcasts+GEPs 3562 // leading to them) here. Eventually it should use them to optimize the 3563 // scalar values produced. 3564 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 3565 assert(II->getIntrinsicID() == Intrinsic::lifetime_start || 3566 II->getIntrinsicID() == Intrinsic::lifetime_end); 3567 II->eraseFromParent(); 3568 continue; 3569 } 3570 3571 // Push the loads and stores we find onto the list. SROA will already 3572 // have validated that all loads and stores are viable candidates for 3573 // promotion. 3574 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 3575 assert(LI->getType() == AI->getAllocatedType()); 3576 Insts.push_back(LI); 3577 continue; 3578 } 3579 if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 3580 assert(SI->getValueOperand()->getType() == AI->getAllocatedType()); 3581 Insts.push_back(SI); 3582 continue; 3583 } 3584 3585 // For everything else, we know that only no-op bitcasts and GEPs will 3586 // make it this far, just recurse through them and recall them for later 3587 // removal. 3588 DeadInsts.push_back(I); 3589 enqueueUsersInWorklist(*I, Worklist, Visited); 3590 } 3591 AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); 3592 while (!DeadInsts.empty()) 3593 DeadInsts.pop_back_val()->eraseFromParent(); 3594 AI->eraseFromParent(); 3595 } 3596 3597 PromotableAllocas.clear(); 3598 return true; 3599} 3600 3601bool SROA::runOnFunction(Function &F) { 3602 if (skipOptnoneFunction(F)) 3603 return false; 3604 3605 DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); 3606 C = &F.getContext(); 3607 DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); 3608 if (!DLP) { 3609 DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); 3610 return false; 3611 } 3612 DL = &DLP->getDataLayout(); 3613 DominatorTreeWrapperPass *DTWP = 3614 getAnalysisIfAvailable<DominatorTreeWrapperPass>(); 3615 DT = DTWP ? &DTWP->getDomTree() : 0; 3616 3617 BasicBlock &EntryBB = F.getEntryBlock(); 3618 for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); 3619 I != E; ++I) 3620 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) 3621 Worklist.insert(AI); 3622 3623 bool Changed = false; 3624 // A set of deleted alloca instruction pointers which should be removed from 3625 // the list of promotable allocas. 3626 SmallPtrSet<AllocaInst *, 4> DeletedAllocas; 3627 3628 do { 3629 while (!Worklist.empty()) { 3630 Changed |= runOnAlloca(*Worklist.pop_back_val()); 3631 deleteDeadInstructions(DeletedAllocas); 3632 3633 // Remove the deleted allocas from various lists so that we don't try to 3634 // continue processing them. 
3635 if (!DeletedAllocas.empty()) { 3636 auto IsInSet = [&](AllocaInst *AI) { 3637 return DeletedAllocas.count(AI); 3638 }; 3639 Worklist.remove_if(IsInSet); 3640 PostPromotionWorklist.remove_if(IsInSet); 3641 PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(), 3642 PromotableAllocas.end(), 3643 IsInSet), 3644 PromotableAllocas.end()); 3645 DeletedAllocas.clear(); 3646 } 3647 } 3648 3649 Changed |= promoteAllocas(F); 3650 3651 Worklist = PostPromotionWorklist; 3652 PostPromotionWorklist.clear(); 3653 } while (!Worklist.empty()); 3654 3655 return Changed; 3656} 3657 3658void SROA::getAnalysisUsage(AnalysisUsage &AU) const { 3659 if (RequiresDomTree) 3660 AU.addRequired<DominatorTreeWrapperPass>(); 3661 AU.setPreservesCFG(); 3662} 3663