SROA.cpp revision 11687d4982200fe99a820ea513100f237dfc1609
1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9/// \file 10/// This transformation implements the well known scalar replacement of 11/// aggregates transformation. It tries to identify promotable elements of an 12/// aggregate alloca, and promote them to registers. It will also try to 13/// convert uses of an element (or set of elements) of an alloca into a vector 14/// or bitfield-style integer scalar if appropriate. 15/// 16/// It works to do this with minimal slicing of the alloca so that regions 17/// which are merely transferred in and out of external memory remain unchanged 18/// and are not decomposed to scalar code. 19/// 20/// Because this also performs alloca promotion, it can be thought of as also 21/// serving the purpose of SSA formation. The algorithm iterates on the 22/// function until all opportunities for promotion have been realized. 23/// 24//===----------------------------------------------------------------------===// 25 26#define DEBUG_TYPE "sroa" 27#include "llvm/Transforms/Scalar.h" 28#include "llvm/ADT/STLExtras.h" 29#include "llvm/ADT/SetVector.h" 30#include "llvm/ADT/SmallVector.h" 31#include "llvm/ADT/Statistic.h" 32#include "llvm/Analysis/Dominators.h" 33#include "llvm/Analysis/Loads.h" 34#include "llvm/Analysis/PtrUseVisitor.h" 35#include "llvm/Analysis/ValueTracking.h" 36#include "llvm/DIBuilder.h" 37#include "llvm/DebugInfo.h" 38#include "llvm/IR/Constants.h" 39#include "llvm/IR/DataLayout.h" 40#include "llvm/IR/DerivedTypes.h" 41#include "llvm/IR/Function.h" 42#include "llvm/IR/IRBuilder.h" 43#include "llvm/IR/Instructions.h" 44#include "llvm/IR/IntrinsicInst.h" 45#include "llvm/IR/LLVMContext.h" 46#include "llvm/IR/Operator.h" 47#include "llvm/InstVisitor.h" 48#include "llvm/Pass.h" 49#include "llvm/Support/CommandLine.h" 50#include "llvm/Support/Debug.h" 51#include "llvm/Support/ErrorHandling.h" 52#include "llvm/Support/MathExtras.h" 53#include "llvm/Support/raw_ostream.h" 54#include "llvm/Transforms/Utils/Local.h" 55#include "llvm/Transforms/Utils/PromoteMemToReg.h" 56#include "llvm/Transforms/Utils/SSAUpdater.h" 57using namespace llvm; 58 59STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); 60STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); 61STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); 62STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); 63STATISTIC(NumDeleted, "Number of instructions deleted"); 64STATISTIC(NumVectorized, "Number of vectorized aggregates"); 65 66/// Hidden option to force the pass to not use DomTree and mem2reg, instead 67/// forming SSA values through the SSAUpdater infrastructure. 68static cl::opt<bool> 69ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); 70 71namespace { 72/// \brief Alloca partitioning representation. 73/// 74/// This class represents a partitioning of an alloca into slices, and 75/// information about the nature of uses of each slice of the alloca. The goal 76/// is that this information is sufficient to decide if and how to split the 77/// alloca apart and replace slices with scalars. 
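/// (A hypothetical illustration, not drawn from a test case: for an
/// "%a = alloca [8 x i8]" whose bytes [0,4) are only ever loaded and stored
/// as an i32 and whose bytes [4,8) are only touched by a memset, the
/// partitioning describes two 4-byte slices that can be rewritten
/// independently of one another.)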
It is also intended that this 78/// structure can capture the relevant information needed both to decide about 79/// and to enact these transformations. 80class AllocaPartitioning { 81public: 82 /// \brief A common base class for representing a half-open byte range. 83 struct ByteRange { 84 /// \brief The beginning offset of the range. 85 uint64_t BeginOffset; 86 87 /// \brief The ending offset, not included in the range. 88 uint64_t EndOffset; 89 90 ByteRange() : BeginOffset(), EndOffset() {} 91 ByteRange(uint64_t BeginOffset, uint64_t EndOffset) 92 : BeginOffset(BeginOffset), EndOffset(EndOffset) {} 93 94 /// \brief Support for ordering ranges. 95 /// 96 /// This provides an ordering over ranges such that start offsets are 97 /// always increasing, and within equal start offsets, the end offsets are 98 /// decreasing. Thus the spanning range comes first in a cluster with the 99 /// same start position. 100 bool operator<(const ByteRange &RHS) const { 101 if (BeginOffset < RHS.BeginOffset) return true; 102 if (BeginOffset > RHS.BeginOffset) return false; 103 if (EndOffset > RHS.EndOffset) return true; 104 return false; 105 } 106 107 /// \brief Support comparison with a single offset to allow binary searches. 108 friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) { 109 return LHS.BeginOffset < RHSOffset; 110 } 111 112 friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, 113 const ByteRange &RHS) { 114 return LHSOffset < RHS.BeginOffset; 115 } 116 117 bool operator==(const ByteRange &RHS) const { 118 return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset; 119 } 120 bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); } 121 }; 122 123 /// \brief A partition of an alloca. 124 /// 125 /// This structure represents a contiguous partition of the alloca. These are 126 /// formed by examining the uses of the alloca. During formation, they may 127 /// overlap but once an AllocaPartitioning is built, the Partitions within it 128 /// are all disjoint. 129 struct Partition : public ByteRange { 130 /// \brief Whether this partition is splittable into smaller partitions. 131 /// 132 /// We flag partitions as splittable when they are formed entirely due to 133 /// accesses by trivially splittable operations such as memset and memcpy. 134 bool IsSplittable; 135 136 /// \brief Test whether a partition has been marked as dead. 137 bool isDead() const { 138 if (BeginOffset == UINT64_MAX) { 139 assert(EndOffset == UINT64_MAX); 140 return true; 141 } 142 return false; 143 } 144 145 /// \brief Kill a partition. 146 /// This is accomplished by setting both its beginning and end offset to 147 /// the maximum possible value. 148 void kill() { 149 assert(!isDead() && "He's Dead, Jim!"); 150 BeginOffset = EndOffset = UINT64_MAX; 151 } 152 153 Partition() : ByteRange(), IsSplittable() {} 154 Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) 155 : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} 156 }; 157 158 /// \brief A particular use of a partition of the alloca. 159 /// 160 /// This structure is used to associate uses of a partition with it. They 161 /// mark the range of bytes which are referenced by a particular instruction, 162 /// and includes a handle to the user itself and the pointer value in use. 163 /// The bounds of these uses are determined by intersecting the bounds of the 164 /// memory use itself with a particular partition. 
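/// (For example, assuming a store that covers bytes [0,8) of an alloca
/// partitioned into [0,4) and [4,8), two PartitionUse entries are produced,
/// one clamped to each partition.)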
As a consequence there is 165 /// intentionally overlap between various uses of the same partition. 166 struct PartitionUse : public ByteRange { 167 /// \brief The use in question. Provides access to both user and used value. 168 /// 169 /// Note that this may be null if the partition use is *dead*, that is, it 170 /// should be ignored. 171 Use *U; 172 173 PartitionUse() : ByteRange(), U() {} 174 PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U) 175 : ByteRange(BeginOffset, EndOffset), U(U) {} 176 }; 177 178 /// \brief Construct a partitioning of a particular alloca. 179 /// 180 /// Construction does most of the work for partitioning the alloca. This 181 /// performs the necessary walks of users and builds a partitioning from it. 182 AllocaPartitioning(const DataLayout &TD, AllocaInst &AI); 183 184 /// \brief Test whether a pointer to the allocation escapes our analysis. 185 /// 186 /// If this is true, the partitioning is never fully built and should be 187 /// ignored. 188 bool isEscaped() const { return PointerEscapingInstr; } 189 190 /// \brief Support for iterating over the partitions. 191 /// @{ 192 typedef SmallVectorImpl<Partition>::iterator iterator; 193 iterator begin() { return Partitions.begin(); } 194 iterator end() { return Partitions.end(); } 195 196 typedef SmallVectorImpl<Partition>::const_iterator const_iterator; 197 const_iterator begin() const { return Partitions.begin(); } 198 const_iterator end() const { return Partitions.end(); } 199 /// @} 200 201 /// \brief Support for iterating over and manipulating a particular 202 /// partition's uses. 203 /// 204 /// The iteration support provided for uses is more limited, but also 205 /// includes some manipulation routines to support rewriting the uses of 206 /// partitions during SROA. 207 /// @{ 208 typedef SmallVectorImpl<PartitionUse>::iterator use_iterator; 209 use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); } 210 use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); } 211 use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); } 212 use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); } 213 214 typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator; 215 const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); } 216 const_use_iterator use_begin(const_iterator I) const { 217 return Uses[I - begin()].begin(); 218 } 219 const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); } 220 const_use_iterator use_end(const_iterator I) const { 221 return Uses[I - begin()].end(); 222 } 223 224 unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); } 225 unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); } 226 const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const { 227 return Uses[PIdx][UIdx]; 228 } 229 const PartitionUse &getUse(const_iterator I, unsigned UIdx) const { 230 return Uses[I - begin()][UIdx]; 231 } 232 233 void use_push_back(unsigned Idx, const PartitionUse &PU) { 234 Uses[Idx].push_back(PU); 235 } 236 void use_push_back(const_iterator I, const PartitionUse &PU) { 237 Uses[I - begin()].push_back(PU); 238 } 239 /// @} 240 241 /// \brief Allow iterating the dead users for this alloca. 242 /// 243 /// These are instructions which will never actually use the alloca as they 244 /// are outside the allocated range. They are safe to replace with undef and 245 /// delete. 
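  /// (Hypothetical example: a load through a GEP to byte 64 of a 16-byte
  /// alloca can never read allocated storage, so that load is collected here
  /// and later replaced with undef and deleted.)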
  /// @{
  typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator;
  dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); }
  dead_user_iterator dead_user_end() const { return DeadUsers.end(); }
  /// @}

  /// \brief Allow iterating the dead expressions referring to this alloca.
  ///
  /// These are operands which cannot actually be used to refer to the
  /// alloca as they are outside its range and the user doesn't correct for
  /// that. These mostly consist of PHI node inputs and the like which we just
  /// need to replace with undef.
  /// @{
  typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator;
  dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); }
  dead_op_iterator dead_op_end() const { return DeadOperands.end(); }
  /// @}

  /// \brief MemTransferInst auxiliary data.
  /// This struct provides some auxiliary data about memory transfer
  /// intrinsics such as memcpy and memmove. These intrinsics can use two
  /// different ranges within the same alloca, and provide other challenges to
  /// correctly represent. We stash extra data to help us untangle this
  /// after the partitioning is complete.
  struct MemTransferOffsets {
    /// The destination begin and end offsets when the destination is within
    /// this alloca. If the end offset is zero the destination is not within
    /// this alloca.
    uint64_t DestBegin, DestEnd;

    /// The source begin and end offsets when the source is within this alloca.
    /// If the end offset is zero, the source is not within this alloca.
    uint64_t SourceBegin, SourceEnd;

    /// Flag for whether an alloca is splittable.
    bool IsSplittable;
  };
  MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const {
    return MemTransferInstData.lookup(&II);
  }

  /// \brief Map from a PHI or select operand back to a partition.
  ///
  /// When manipulating PHI nodes or selects, they can use more than one
  /// partition of an alloca. We store a special mapping to allow finding the
  /// partition referenced by each of these operands, if any.
  iterator findPartitionForPHIOrSelectOperand(Use *U) {
    SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt
      = PHIOrSelectOpMap.find(U);
    if (MapIt == PHIOrSelectOpMap.end())
      return end();

    return begin() + MapIt->second.first;
  }

  /// \brief Map from a PHI or select operand back to the specific use of
  /// a partition.
  ///
  /// Similar to mapping these operands back to the partitions, this maps
  /// directly to the use structure of that partition.
  use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) {
    SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt
      = PHIOrSelectOpMap.find(U);
    assert(MapIt != PHIOrSelectOpMap.end());
    return Uses[MapIt->second.first].begin() + MapIt->second.second;
  }

  /// \brief Compute a common type among the uses of a particular partition.
  ///
  /// This routine walks all of the uses of a particular partition and tries
  /// to find a common type between them. Untyped operations such as memset and
  /// memcpy are ignored.
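  /// (A hedged sketch of the intent: if one user loads a 4-byte partition as
  /// 'float' and another stores it as '<4 x i8>', no common type exists and
  /// null is returned, whereas any i32 load or store exactly covering the
  /// partition wins outright, since integers serve as a "bucket of bits".)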
318 Type *getCommonType(iterator I) const; 319 320#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 321 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; 322 void printUsers(raw_ostream &OS, const_iterator I, 323 StringRef Indent = " ") const; 324 void print(raw_ostream &OS) const; 325 void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const; 326 void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const; 327#endif 328 329private: 330 template <typename DerivedT, typename RetT = void> class BuilderBase; 331 class PartitionBuilder; 332 friend class AllocaPartitioning::PartitionBuilder; 333 class UseBuilder; 334 friend class AllocaPartitioning::UseBuilder; 335 336#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 337 /// \brief Handle to alloca instruction to simplify method interfaces. 338 AllocaInst &AI; 339#endif 340 341 /// \brief The instruction responsible for this alloca having no partitioning. 342 /// 343 /// When an instruction (potentially) escapes the pointer to the alloca, we 344 /// store a pointer to that here and abort trying to partition the alloca. 345 /// This will be null if the alloca is partitioned successfully. 346 Instruction *PointerEscapingInstr; 347 348 /// \brief The partitions of the alloca. 349 /// 350 /// We store a vector of the partitions over the alloca here. This vector is 351 /// sorted by increasing begin offset, and then by decreasing end offset. See 352 /// the Partition inner class for more details. Initially (during 353 /// construction) there are overlaps, but we form a disjoint sequence of 354 /// partitions while finishing construction and a fully constructed object is 355 /// expected to always have this as a disjoint space. 356 SmallVector<Partition, 8> Partitions; 357 358 /// \brief The uses of the partitions. 359 /// 360 /// This is essentially a mapping from each partition to a list of uses of 361 /// that partition. The mapping is done with a Uses vector that has the exact 362 /// same number of entries as the partition vector. Each entry is itself 363 /// a vector of the uses. 364 SmallVector<SmallVector<PartitionUse, 2>, 8> Uses; 365 366 /// \brief Instructions which will become dead if we rewrite the alloca. 367 /// 368 /// Note that these are not separated by partition. This is because we expect 369 /// a partitioned alloca to be completely rewritten or not rewritten at all. 370 /// If rewritten, all these instructions can simply be removed and replaced 371 /// with undef as they come from outside of the allocated space. 372 SmallVector<Instruction *, 8> DeadUsers; 373 374 /// \brief Operands which will become dead if we rewrite the alloca. 375 /// 376 /// These are operands that in their particular use can be replaced with 377 /// undef when we rewrite the alloca. These show up in out-of-bounds inputs 378 /// to PHI nodes and the like. They aren't entirely dead (there might be 379 /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we 380 /// want to swap this particular input for undef to simplify the use lists of 381 /// the alloca. 382 SmallVector<Use *, 8> DeadOperands; 383 384 /// \brief The underlying storage for auxiliary memcpy and memset info. 385 SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData; 386 387 /// \brief A side datastructure used when building up the partitions and uses. 
388 /// 389 /// This mapping is only really used during the initial building of the 390 /// partitioning so that we can retain information about PHI and select nodes 391 /// processed. 392 SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes; 393 394 /// \brief Auxiliary information for particular PHI or select operands. 395 SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap; 396 397 /// \brief A utility routine called from the constructor. 398 /// 399 /// This does what it says on the tin. It is the key of the alloca partition 400 /// splitting and merging. After it is called we have the desired disjoint 401 /// collection of partitions. 402 void splitAndMergePartitions(); 403}; 404} 405 406static Value *foldSelectInst(SelectInst &SI) { 407 // If the condition being selected on is a constant or the same value is 408 // being selected between, fold the select. Yes this does (rarely) happen 409 // early on. 410 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) 411 return SI.getOperand(1+CI->isZero()); 412 if (SI.getOperand(1) == SI.getOperand(2)) 413 return SI.getOperand(1); 414 415 return 0; 416} 417 418/// \brief Builder for the alloca partitioning. 419/// 420/// This class builds an alloca partitioning by recursively visiting the uses 421/// of an alloca and splitting the partitions for each load and store at each 422/// offset. 423class AllocaPartitioning::PartitionBuilder 424 : public PtrUseVisitor<PartitionBuilder> { 425 friend class PtrUseVisitor<PartitionBuilder>; 426 friend class InstVisitor<PartitionBuilder>; 427 typedef PtrUseVisitor<PartitionBuilder> Base; 428 429 const uint64_t AllocSize; 430 AllocaPartitioning &P; 431 432 SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap; 433 434public: 435 PartitionBuilder(const DataLayout &DL, AllocaInst &AI, AllocaPartitioning &P) 436 : PtrUseVisitor<PartitionBuilder>(DL), 437 AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), 438 P(P) {} 439 440private: 441 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size, 442 bool IsSplittable = false) { 443 // Completely skip uses which have a zero size or start either before or 444 // past the end of the allocation. 445 if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) { 446 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset 447 << " which has zero size or starts outside of the " 448 << AllocSize << " byte alloca:\n" 449 << " alloca: " << P.AI << "\n" 450 << " use: " << I << "\n"); 451 return; 452 } 453 454 uint64_t BeginOffset = Offset.getZExtValue(); 455 uint64_t EndOffset = BeginOffset + Size; 456 457 // Clamp the end offset to the end of the allocation. Note that this is 458 // formulated to handle even the case where "BeginOffset + Size" overflows. 459 // NOTE! This may appear superficially to be something we could ignore 460 // entirely, but that is not so! There may be PHI-node uses where some 461 // instructions are dead but not others. We can't completely ignore the 462 // PHI node, and so have to record at least the information here. 463 assert(AllocSize >= BeginOffset); // Established above. 
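    // Hypothetical example of the clamping below: a 16-byte memcpy rooted at
    // offset 8 of a 16-byte alloca gives BeginOffset = 8 and a raw end of 24,
    // which is clamped so the recorded partition becomes [8, 16).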
    if (Size > AllocSize - BeginOffset) {
      DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
                   << " to remain within the " << AllocSize << " byte alloca:\n"
                   << " alloca: " << P.AI << "\n"
                   << " use: " << I << "\n");
      EndOffset = AllocSize;
    }

    Partition New(BeginOffset, EndOffset, IsSplittable);
    P.Partitions.push_back(New);
  }

  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                         bool IsVolatile) {
    uint64_t Size = DL.getTypeStoreSize(Ty);

    // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
    // ignore it. Note that this is more strict than the generic clamping
    // behavior of insertUse. We also try to handle cases which might run the
    // risk of overflow.
    // FIXME: We should instead consider the pointer to have escaped if this
    // function is being instrumented for addressing bugs or race conditions.
    if (Offset.isNegative() || Size > AllocSize ||
        Offset.ugt(AllocSize - Size)) {
      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte "
                   << (isa<LoadInst>(I) ? "load" : "store") << " @" << Offset
                   << " which extends past the end of the " << AllocSize
                   << " byte alloca:\n"
                   << " alloca: " << P.AI << "\n"
                   << " use: " << I << "\n");
      return;
    }

    // We allow splitting of loads and stores where the type is an integer type
    // and which cover the entire alloca. Such integer loads and stores
    // often require decomposition into fine grained loads and stores.
    bool IsSplittable = false;
    if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
      IsSplittable = !IsVolatile && ITy->getBitWidth() == AllocSize*8;

    insertUse(I, Offset, Size, IsSplittable);
  }

  void visitLoadInst(LoadInst &LI) {
    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
           "All simple FCA loads should have been pre-split");

    if (!IsOffsetKnown)
      return PI.setAborted(&LI);

    return handleLoadOrStore(LI.getType(), LI, Offset, LI.isVolatile());
  }

  void visitStoreInst(StoreInst &SI) {
    Value *ValOp = SI.getValueOperand();
    if (ValOp == *U)
      return PI.setEscapedAndAborted(&SI);
    if (!IsOffsetKnown)
      return PI.setAborted(&SI);

    assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
           "All simple FCA stores should have been pre-split");
    handleLoadOrStore(ValOp->getType(), SI, Offset, SI.isVolatile());
  }


  void visitMemSetInst(MemSetInst &II) {
    assert(II.getRawDest() == *U && "Pointer use is not the destination?");
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    if ((Length && Length->getValue() == 0) ||
        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
      // Zero-length mem transfer intrinsics can be ignored entirely.
      return;

    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    insertUse(II, Offset,
              Length ? Length->getLimitedValue()
                     : AllocSize - Offset.getLimitedValue(),
              (bool)Length);
  }

  void visitMemTransferInst(MemTransferInst &II) {
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    if ((Length && Length->getValue() == 0) ||
        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
      // Zero-length mem transfer intrinsics can be ignored entirely.
553 return; 554 555 if (!IsOffsetKnown) 556 return PI.setAborted(&II); 557 558 uint64_t RawOffset = Offset.getLimitedValue(); 559 uint64_t Size = Length ? Length->getLimitedValue() 560 : AllocSize - RawOffset; 561 562 MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; 563 564 // Only intrinsics with a constant length can be split. 565 Offsets.IsSplittable = Length; 566 567 if (*U == II.getRawDest()) { 568 Offsets.DestBegin = RawOffset; 569 Offsets.DestEnd = RawOffset + Size; 570 } 571 if (*U == II.getRawSource()) { 572 Offsets.SourceBegin = RawOffset; 573 Offsets.SourceEnd = RawOffset + Size; 574 } 575 576 // If we have set up end offsets for both the source and the destination, 577 // we have found both sides of this transfer pointing at the same alloca. 578 bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd; 579 if (SeenBothEnds && II.getRawDest() != II.getRawSource()) { 580 unsigned PrevIdx = MemTransferPartitionMap[&II]; 581 582 // Check if the begin offsets match and this is a non-volatile transfer. 583 // In that case, we can completely elide the transfer. 584 if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) { 585 P.Partitions[PrevIdx].kill(); 586 return; 587 } 588 589 // Otherwise we have an offset transfer within the same alloca. We can't 590 // split those. 591 P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false; 592 } else if (SeenBothEnds) { 593 // Handle the case where this exact use provides both ends of the 594 // operation. 595 assert(II.getRawDest() == II.getRawSource()); 596 597 // For non-volatile transfers this is a no-op. 598 if (!II.isVolatile()) 599 return; 600 601 // Otherwise just suppress splitting. 602 Offsets.IsSplittable = false; 603 } 604 605 606 // Insert the use now that we've fixed up the splittable nature. 607 insertUse(II, Offset, Size, Offsets.IsSplittable); 608 609 // Setup the mapping from intrinsic to partition of we've not seen both 610 // ends of this transfer. 611 if (!SeenBothEnds) { 612 unsigned NewIdx = P.Partitions.size() - 1; 613 bool Inserted 614 = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second; 615 assert(Inserted && 616 "Already have intrinsic in map but haven't seen both ends"); 617 (void)Inserted; 618 } 619 } 620 621 // Disable SRoA for any intrinsics except for lifetime invariants. 622 // FIXME: What about debug intrinsics? This matches old behavior, but 623 // doesn't make sense. 624 void visitIntrinsicInst(IntrinsicInst &II) { 625 if (!IsOffsetKnown) 626 return PI.setAborted(&II); 627 628 if (II.getIntrinsicID() == Intrinsic::lifetime_start || 629 II.getIntrinsicID() == Intrinsic::lifetime_end) { 630 ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); 631 uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(), 632 Length->getLimitedValue()); 633 insertUse(II, Offset, Size, true); 634 return; 635 } 636 637 Base::visitIntrinsicInst(II); 638 } 639 640 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { 641 // We consider any PHI or select that results in a direct load or store of 642 // the same offset to be a viable use for partitioning purposes. These uses 643 // are considered unsplittable and the size is the maximum loaded or stored 644 // size. 645 SmallPtrSet<Instruction *, 4> Visited; 646 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses; 647 Visited.insert(Root); 648 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root)); 649 // If there are no loads or stores, the access is dead. 
We mark that as 650 // a size zero access. 651 Size = 0; 652 do { 653 Instruction *I, *UsedI; 654 llvm::tie(UsedI, I) = Uses.pop_back_val(); 655 656 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 657 Size = std::max(Size, DL.getTypeStoreSize(LI->getType())); 658 continue; 659 } 660 if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 661 Value *Op = SI->getOperand(0); 662 if (Op == UsedI) 663 return SI; 664 Size = std::max(Size, DL.getTypeStoreSize(Op->getType())); 665 continue; 666 } 667 668 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { 669 if (!GEP->hasAllZeroIndices()) 670 return GEP; 671 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) && 672 !isa<SelectInst>(I)) { 673 return I; 674 } 675 676 for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; 677 ++UI) 678 if (Visited.insert(cast<Instruction>(*UI))) 679 Uses.push_back(std::make_pair(I, cast<Instruction>(*UI))); 680 } while (!Uses.empty()); 681 682 return 0; 683 } 684 685 void visitPHINode(PHINode &PN) { 686 if (PN.use_empty()) 687 return; 688 if (!IsOffsetKnown) 689 return PI.setAborted(&PN); 690 691 // See if we already have computed info on this node. 692 std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN]; 693 if (PHIInfo.first) { 694 PHIInfo.second = true; 695 insertUse(PN, Offset, PHIInfo.first); 696 return; 697 } 698 699 // Check for an unsafe use of the PHI node. 700 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first)) 701 return PI.setAborted(UnsafeI); 702 703 insertUse(PN, Offset, PHIInfo.first); 704 } 705 706 void visitSelectInst(SelectInst &SI) { 707 if (SI.use_empty()) 708 return; 709 if (Value *Result = foldSelectInst(SI)) { 710 if (Result == *U) 711 // If the result of the constant fold will be the pointer, recurse 712 // through the select as if we had RAUW'ed it. 713 enqueueUsers(SI); 714 715 return; 716 } 717 if (!IsOffsetKnown) 718 return PI.setAborted(&SI); 719 720 // See if we already have computed info on this node. 721 std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI]; 722 if (SelectInfo.first) { 723 SelectInfo.second = true; 724 insertUse(SI, Offset, SelectInfo.first); 725 return; 726 } 727 728 // Check for an unsafe use of the PHI node. 729 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first)) 730 return PI.setAborted(UnsafeI); 731 732 insertUse(SI, Offset, SelectInfo.first); 733 } 734 735 /// \brief Disable SROA entirely if there are unhandled users of the alloca. 736 void visitInstruction(Instruction &I) { 737 PI.setAborted(&I); 738 } 739}; 740 741/// \brief Use adder for the alloca partitioning. 742/// 743/// This class adds the uses of an alloca to all of the partitions which they 744/// use. For splittable partitions, this can end up doing essentially a linear 745/// walk of the partitions, but the number of steps remains bounded by the 746/// total result instruction size: 747/// - The number of partitions is a result of the number unsplittable 748/// instructions using the alloca. 749/// - The number of users of each partition is at worst the total number of 750/// splittable instructions using the alloca. 751/// Thus we will produce N * M instructions in the end, where N are the number 752/// of unsplittable uses and M are the number of splittable. This visitor does 753/// the exact same number of updates to the partitioning. 
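/// As a rough, illustrative example: three unsplittable scalar stores plus one
/// whole-alloca memcpy (N = 3, M = 1) yield three partitions, and the memcpy
/// contributes a use entry to each of them, on the order of N * M entries in
/// total.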
754/// 755/// In the more common case, this visitor will leverage the fact that the 756/// partition space is pre-sorted, and do a logarithmic search for the 757/// partition needed, making the total visit a classical ((N + M) * log(N)) 758/// complexity operation. 759class AllocaPartitioning::UseBuilder : public PtrUseVisitor<UseBuilder> { 760 friend class PtrUseVisitor<UseBuilder>; 761 friend class InstVisitor<UseBuilder>; 762 typedef PtrUseVisitor<UseBuilder> Base; 763 764 const uint64_t AllocSize; 765 AllocaPartitioning &P; 766 767 /// \brief Set to de-duplicate dead instructions found in the use walk. 768 SmallPtrSet<Instruction *, 4> VisitedDeadInsts; 769 770public: 771 UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) 772 : PtrUseVisitor<UseBuilder>(TD), 773 AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())), 774 P(P) {} 775 776private: 777 void markAsDead(Instruction &I) { 778 if (VisitedDeadInsts.insert(&I)) 779 P.DeadUsers.push_back(&I); 780 } 781 782 void insertUse(Instruction &User, const APInt &Offset, uint64_t Size) { 783 // If the use has a zero size or extends outside of the allocation, record 784 // it as a dead use for elimination later. 785 if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) 786 return markAsDead(User); 787 788 uint64_t BeginOffset = Offset.getZExtValue(); 789 uint64_t EndOffset = BeginOffset + Size; 790 791 // Clamp the end offset to the end of the allocation. Note that this is 792 // formulated to handle even the case where "BeginOffset + Size" overflows. 793 assert(AllocSize >= BeginOffset); // Established above. 794 if (Size > AllocSize - BeginOffset) 795 EndOffset = AllocSize; 796 797 // NB: This only works if we have zero overlapping partitions. 798 iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset); 799 if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset) 800 B = llvm::prior(B); 801 for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset; 802 ++I) { 803 PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset), 804 std::min(I->EndOffset, EndOffset), U); 805 P.use_push_back(I, NewPU); 806 if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser())) 807 P.PHIOrSelectOpMap[U] 808 = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1); 809 } 810 } 811 812 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset) { 813 uint64_t Size = DL.getTypeStoreSize(Ty); 814 815 // If this memory access can be shown to *statically* extend outside the 816 // bounds of of the allocation, it's behavior is undefined, so simply 817 // ignore it. Note that this is more strict than the generic clamping 818 // behavior of insertUse. 
819 if (Offset.isNegative() || Size > AllocSize || 820 Offset.ugt(AllocSize - Size)) 821 return markAsDead(I); 822 823 insertUse(I, Offset, Size); 824 } 825 826 void visitBitCastInst(BitCastInst &BC) { 827 if (BC.use_empty()) 828 return markAsDead(BC); 829 830 return Base::visitBitCastInst(BC); 831 } 832 833 void visitGetElementPtrInst(GetElementPtrInst &GEPI) { 834 if (GEPI.use_empty()) 835 return markAsDead(GEPI); 836 837 return Base::visitGetElementPtrInst(GEPI); 838 } 839 840 void visitLoadInst(LoadInst &LI) { 841 assert(IsOffsetKnown); 842 handleLoadOrStore(LI.getType(), LI, Offset); 843 } 844 845 void visitStoreInst(StoreInst &SI) { 846 assert(IsOffsetKnown); 847 handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset); 848 } 849 850 void visitMemSetInst(MemSetInst &II) { 851 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 852 if ((Length && Length->getValue() == 0) || 853 (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) 854 return markAsDead(II); 855 856 assert(IsOffsetKnown); 857 insertUse(II, Offset, Length ? Length->getLimitedValue() 858 : AllocSize - Offset.getLimitedValue()); 859 } 860 861 void visitMemTransferInst(MemTransferInst &II) { 862 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 863 if ((Length && Length->getValue() == 0) || 864 (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) 865 return markAsDead(II); 866 867 assert(IsOffsetKnown); 868 uint64_t Size = Length ? Length->getLimitedValue() 869 : AllocSize - Offset.getLimitedValue(); 870 871 MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; 872 if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd && 873 Offsets.DestBegin == Offsets.SourceBegin) 874 return markAsDead(II); // Skip identity transfers without side-effects. 875 876 insertUse(II, Offset, Size); 877 } 878 879 void visitIntrinsicInst(IntrinsicInst &II) { 880 assert(IsOffsetKnown); 881 assert(II.getIntrinsicID() == Intrinsic::lifetime_start || 882 II.getIntrinsicID() == Intrinsic::lifetime_end); 883 884 ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); 885 insertUse(II, Offset, std::min(Length->getLimitedValue(), 886 AllocSize - Offset.getLimitedValue())); 887 } 888 889 void insertPHIOrSelect(Instruction &User, const APInt &Offset) { 890 uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first; 891 892 // For PHI and select operands outside the alloca, we can't nuke the entire 893 // phi or select -- the other side might still be relevant, so we special 894 // case them here and use a separate structure to track the operands 895 // themselves which should be replaced with undef. 896 if ((Offset.isNegative() && Offset.uge(Size)) || 897 (!Offset.isNegative() && Offset.uge(AllocSize))) { 898 P.DeadOperands.push_back(U); 899 return; 900 } 901 902 insertUse(User, Offset, Size); 903 } 904 905 void visitPHINode(PHINode &PN) { 906 if (PN.use_empty()) 907 return markAsDead(PN); 908 909 assert(IsOffsetKnown); 910 insertPHIOrSelect(PN, Offset); 911 } 912 913 void visitSelectInst(SelectInst &SI) { 914 if (SI.use_empty()) 915 return markAsDead(SI); 916 917 if (Value *Result = foldSelectInst(SI)) { 918 if (Result == *U) 919 // If the result of the constant fold will be the pointer, recurse 920 // through the select as if we had RAUW'ed it. 921 enqueueUsers(SI); 922 else 923 // Otherwise the operand to the select is dead, and we can replace it 924 // with undef. 
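          // For instance (hypothetical IR): in
          //   %p = select i1 true, i32* %other, i32* %gep.of.alloca
          // the select always yields %other, so the alloca-derived operand is
          // recorded here and later replaced with undef.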
925 P.DeadOperands.push_back(U); 926 927 return; 928 } 929 930 assert(IsOffsetKnown); 931 insertPHIOrSelect(SI, Offset); 932 } 933 934 /// \brief Unreachable, we've already visited the alloca once. 935 void visitInstruction(Instruction &I) { 936 llvm_unreachable("Unhandled instruction in use builder."); 937 } 938}; 939 940void AllocaPartitioning::splitAndMergePartitions() { 941 size_t NumDeadPartitions = 0; 942 943 // Track the range of splittable partitions that we pass when accumulating 944 // overlapping unsplittable partitions. 945 uint64_t SplitEndOffset = 0ull; 946 947 Partition New(0ull, 0ull, false); 948 949 for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) { 950 ++j; 951 952 if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) { 953 assert(New.BeginOffset == New.EndOffset); 954 New = Partitions[i]; 955 } else { 956 assert(New.IsSplittable); 957 New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset); 958 } 959 assert(New.BeginOffset != New.EndOffset); 960 961 // Scan the overlapping partitions. 962 while (j != e && New.EndOffset > Partitions[j].BeginOffset) { 963 // If the new partition we are forming is splittable, stop at the first 964 // unsplittable partition. 965 if (New.IsSplittable && !Partitions[j].IsSplittable) 966 break; 967 968 // Grow the new partition to include any equally splittable range. 'j' is 969 // always equally splittable when New is splittable, but when New is not 970 // splittable, we may subsume some (or part of some) splitable partition 971 // without growing the new one. 972 if (New.IsSplittable == Partitions[j].IsSplittable) { 973 New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset); 974 } else { 975 assert(!New.IsSplittable); 976 assert(Partitions[j].IsSplittable); 977 SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset); 978 } 979 980 Partitions[j].kill(); 981 ++NumDeadPartitions; 982 ++j; 983 } 984 985 // If the new partition is splittable, chop off the end as soon as the 986 // unsplittable subsequent partition starts and ensure we eventually cover 987 // the splittable area. 988 if (j != e && New.IsSplittable) { 989 SplitEndOffset = std::max(SplitEndOffset, New.EndOffset); 990 New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); 991 } 992 993 // Add the new partition if it differs from the original one and is 994 // non-empty. We can end up with an empty partition here if it was 995 // splittable but there is an unsplittable one that starts at the same 996 // offset. 997 if (New != Partitions[i]) { 998 if (New.BeginOffset != New.EndOffset) 999 Partitions.push_back(New); 1000 // Mark the old one for removal. 1001 Partitions[i].kill(); 1002 ++NumDeadPartitions; 1003 } 1004 1005 New.BeginOffset = New.EndOffset; 1006 if (!New.IsSplittable) { 1007 New.EndOffset = std::max(New.EndOffset, SplitEndOffset); 1008 if (j != e && !Partitions[j].IsSplittable) 1009 New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); 1010 New.IsSplittable = true; 1011 // If there is a trailing splittable partition which won't be fused into 1012 // the next splittable partition go ahead and add it onto the partitions 1013 // list. 1014 if (New.BeginOffset < New.EndOffset && 1015 (j == e || !Partitions[j].IsSplittable || 1016 New.EndOffset < Partitions[j].BeginOffset)) { 1017 Partitions.push_back(New); 1018 New.BeginOffset = New.EndOffset = 0ull; 1019 } 1020 } 1021 } 1022 1023 // Re-sort the partitions now that they have been split and merged into 1024 // disjoint set of partitions. 
Also remove any of the dead partitions we've 1025 // replaced in the process. 1026 std::sort(Partitions.begin(), Partitions.end()); 1027 if (NumDeadPartitions) { 1028 assert(Partitions.back().isDead()); 1029 assert((ptrdiff_t)NumDeadPartitions == 1030 std::count(Partitions.begin(), Partitions.end(), Partitions.back())); 1031 } 1032 Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end()); 1033} 1034 1035AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI) 1036 : 1037#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1038 AI(AI), 1039#endif 1040 PointerEscapingInstr(0) { 1041 PartitionBuilder PB(TD, AI, *this); 1042 PartitionBuilder::PtrInfo PtrI = PB.visitPtr(AI); 1043 if (PtrI.isEscaped() || PtrI.isAborted()) { 1044 // FIXME: We should sink the escape vs. abort info into the caller nicely, 1045 // possibly by just storing the PtrInfo in the AllocaPartitioning. 1046 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst() 1047 : PtrI.getAbortingInst(); 1048 assert(PointerEscapingInstr && "Did not track a bad instruction"); 1049 return; 1050 } 1051 1052 // Sort the uses. This arranges for the offsets to be in ascending order, 1053 // and the sizes to be in descending order. 1054 std::sort(Partitions.begin(), Partitions.end()); 1055 1056 // Remove any partitions from the back which are marked as dead. 1057 while (!Partitions.empty() && Partitions.back().isDead()) 1058 Partitions.pop_back(); 1059 1060 if (Partitions.size() > 1) { 1061 // Intersect splittability for all partitions with equal offsets and sizes. 1062 // Then remove all but the first so that we have a sequence of non-equal but 1063 // potentially overlapping partitions. 1064 for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E; 1065 I = J) { 1066 ++J; 1067 while (J != E && *I == *J) { 1068 I->IsSplittable &= J->IsSplittable; 1069 ++J; 1070 } 1071 } 1072 Partitions.erase(std::unique(Partitions.begin(), Partitions.end()), 1073 Partitions.end()); 1074 1075 // Split splittable and merge unsplittable partitions into a disjoint set 1076 // of partitions over the used space of the allocation. 1077 splitAndMergePartitions(); 1078 } 1079 1080 // Now build up the user lists for each of these disjoint partitions by 1081 // re-walking the recursive users of the alloca. 1082 Uses.resize(Partitions.size()); 1083 UseBuilder UB(TD, AI, *this); 1084 PtrI = UB.visitPtr(AI); 1085 assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!"); 1086 assert(!PtrI.isAborted() && "Early aborted the visit of the pointer."); 1087} 1088 1089Type *AllocaPartitioning::getCommonType(iterator I) const { 1090 Type *Ty = 0; 1091 for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { 1092 if (!UI->U) 1093 continue; // Skip dead uses. 1094 if (isa<IntrinsicInst>(*UI->U->getUser())) 1095 continue; 1096 if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset) 1097 continue; 1098 1099 Type *UserTy = 0; 1100 if (LoadInst *LI = dyn_cast<LoadInst>(UI->U->getUser())) 1101 UserTy = LI->getType(); 1102 else if (StoreInst *SI = dyn_cast<StoreInst>(UI->U->getUser())) 1103 UserTy = SI->getValueOperand()->getType(); 1104 else 1105 return 0; // Bail if we have weird uses. 1106 1107 if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { 1108 // If the type is larger than the partition, skip it. We only encounter 1109 // this for split integer operations where we want to use the type of the 1110 // entity causing the split. 
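      // Hedged example: if an i64 load forced the alloca to be split and this
      // partition covers only 4 of its bytes, the 64-bit integer type is wider
      // than the 32-bit partition and the check below skips it.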
1111 if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8) 1112 continue; 1113 1114 // If we have found an integer type use covering the alloca, use that 1115 // regardless of the other types, as integers are often used for a "bucket 1116 // of bits" type. 1117 return ITy; 1118 } 1119 1120 if (Ty && Ty != UserTy) 1121 return 0; 1122 1123 Ty = UserTy; 1124 } 1125 return Ty; 1126} 1127 1128#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1129 1130void AllocaPartitioning::print(raw_ostream &OS, const_iterator I, 1131 StringRef Indent) const { 1132 OS << Indent << "partition #" << (I - begin()) 1133 << " [" << I->BeginOffset << "," << I->EndOffset << ")" 1134 << (I->IsSplittable ? " (splittable)" : "") 1135 << (Uses[I - begin()].empty() ? " (zero uses)" : "") 1136 << "\n"; 1137} 1138 1139void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, 1140 StringRef Indent) const { 1141 for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { 1142 if (!UI->U) 1143 continue; // Skip dead uses. 1144 OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " 1145 << "used by: " << *UI->U->getUser() << "\n"; 1146 if (MemTransferInst *II = dyn_cast<MemTransferInst>(UI->U->getUser())) { 1147 const MemTransferOffsets &MTO = MemTransferInstData.lookup(II); 1148 bool IsDest; 1149 if (!MTO.IsSplittable) 1150 IsDest = UI->BeginOffset == MTO.DestBegin; 1151 else 1152 IsDest = MTO.DestBegin != 0u; 1153 OS << Indent << " (original " << (IsDest ? "dest" : "source") << ": " 1154 << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin) 1155 << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n"; 1156 } 1157 } 1158} 1159 1160void AllocaPartitioning::print(raw_ostream &OS) const { 1161 if (PointerEscapingInstr) { 1162 OS << "No partitioning for alloca: " << AI << "\n" 1163 << " A pointer to this alloca escaped by:\n" 1164 << " " << *PointerEscapingInstr << "\n"; 1165 return; 1166 } 1167 1168 OS << "Partitioning of alloca: " << AI << "\n"; 1169 for (const_iterator I = begin(), E = end(); I != E; ++I) { 1170 print(OS, I); 1171 printUsers(OS, I); 1172 } 1173} 1174 1175void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); } 1176void AllocaPartitioning::dump() const { print(dbgs()); } 1177 1178#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1179 1180 1181namespace { 1182/// \brief Implementation of LoadAndStorePromoter for promoting allocas. 1183/// 1184/// This subclass of LoadAndStorePromoter adds overrides to handle promoting 1185/// the loads and stores of an alloca instruction, as well as updating its 1186/// debug information. This is used when a domtree is unavailable and thus 1187/// mem2reg in its full form can't be used to handle promotion of allocas to 1188/// scalar values. 1189class AllocaPromoter : public LoadAndStorePromoter { 1190 AllocaInst &AI; 1191 DIBuilder &DIB; 1192 1193 SmallVector<DbgDeclareInst *, 4> DDIs; 1194 SmallVector<DbgValueInst *, 4> DVIs; 1195 1196public: 1197 AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, 1198 AllocaInst &AI, DIBuilder &DIB) 1199 : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} 1200 1201 void run(const SmallVectorImpl<Instruction*> &Insts) { 1202 // Remember which alloca we're promoting (for isInstInList). 
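    // The lookup below collects any llvm.dbg.declare / llvm.dbg.value
    // intrinsics that refer to this alloca through its wrapping metadata node,
    // so their debug info can be rewritten during promotion and the intrinsics
    // erased afterwards.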
1203 if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { 1204 for (Value::use_iterator UI = DebugNode->use_begin(), 1205 UE = DebugNode->use_end(); 1206 UI != UE; ++UI) 1207 if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) 1208 DDIs.push_back(DDI); 1209 else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) 1210 DVIs.push_back(DVI); 1211 } 1212 1213 LoadAndStorePromoter::run(Insts); 1214 AI.eraseFromParent(); 1215 while (!DDIs.empty()) 1216 DDIs.pop_back_val()->eraseFromParent(); 1217 while (!DVIs.empty()) 1218 DVIs.pop_back_val()->eraseFromParent(); 1219 } 1220 1221 virtual bool isInstInList(Instruction *I, 1222 const SmallVectorImpl<Instruction*> &Insts) const { 1223 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 1224 return LI->getOperand(0) == &AI; 1225 return cast<StoreInst>(I)->getPointerOperand() == &AI; 1226 } 1227 1228 virtual void updateDebugInfo(Instruction *Inst) const { 1229 for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), 1230 E = DDIs.end(); I != E; ++I) { 1231 DbgDeclareInst *DDI = *I; 1232 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) 1233 ConvertDebugDeclareToDebugValue(DDI, SI, DIB); 1234 else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) 1235 ConvertDebugDeclareToDebugValue(DDI, LI, DIB); 1236 } 1237 for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), 1238 E = DVIs.end(); I != E; ++I) { 1239 DbgValueInst *DVI = *I; 1240 Value *Arg = 0; 1241 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { 1242 // If an argument is zero extended then use argument directly. The ZExt 1243 // may be zapped by an optimization pass in future. 1244 if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) 1245 Arg = dyn_cast<Argument>(ZExt->getOperand(0)); 1246 if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) 1247 Arg = dyn_cast<Argument>(SExt->getOperand(0)); 1248 if (!Arg) 1249 Arg = SI->getOperand(0); 1250 } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { 1251 Arg = LI->getOperand(0); 1252 } else { 1253 continue; 1254 } 1255 Instruction *DbgVal = 1256 DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), 1257 Inst); 1258 DbgVal->setDebugLoc(DVI->getDebugLoc()); 1259 } 1260 } 1261}; 1262} // end anon namespace 1263 1264 1265namespace { 1266/// \brief An optimization pass providing Scalar Replacement of Aggregates. 1267/// 1268/// This pass takes allocations which can be completely analyzed (that is, they 1269/// don't escape) and tries to turn them into scalar SSA values. There are 1270/// a few steps to this process. 1271/// 1272/// 1) It takes allocations of aggregates and analyzes the ways in which they 1273/// are used to try to split them into smaller allocations, ideally of 1274/// a single scalar data type. It will split up memcpy and memset accesses 1275/// as necessary and try to isolate individual scalar accesses. 1276/// 2) It will transform accesses into forms which are suitable for SSA value 1277/// promotion. This can be replacing a memset with a scalar store of an 1278/// integer value, or it can involve speculating operations on a PHI or 1279/// select to be a PHI or select of the results. 1280/// 3) Finally, this will try to detect a pattern of accesses which map cleanly 1281/// onto insert and extract operations on a vector value, and convert them to 1282/// this form. By doing so, it will enable promotion of vector aggregates to 1283/// SSA vector values. 
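///
/// As an illustrative sketch (hypothetical IR, not from a regression test):
///   %a  = alloca { i32, i32 }
///   %f0 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
///   store i32 %x, i32* %f0
///   %v  = load i32* %f0
/// is first split so the first field gets its own i32 alloca, which is then
/// promoted; %v simply becomes %x and the alloca disappears entirely.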
1284class SROA : public FunctionPass { 1285 const bool RequiresDomTree; 1286 1287 LLVMContext *C; 1288 const DataLayout *TD; 1289 DominatorTree *DT; 1290 1291 /// \brief Worklist of alloca instructions to simplify. 1292 /// 1293 /// Each alloca in the function is added to this. Each new alloca formed gets 1294 /// added to it as well to recursively simplify unless that alloca can be 1295 /// directly promoted. Finally, each time we rewrite a use of an alloca other 1296 /// the one being actively rewritten, we add it back onto the list if not 1297 /// already present to ensure it is re-visited. 1298 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist; 1299 1300 /// \brief A collection of instructions to delete. 1301 /// We try to batch deletions to simplify code and make things a bit more 1302 /// efficient. 1303 SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts; 1304 1305 /// \brief Post-promotion worklist. 1306 /// 1307 /// Sometimes we discover an alloca which has a high probability of becoming 1308 /// viable for SROA after a round of promotion takes place. In those cases, 1309 /// the alloca is enqueued here for re-processing. 1310 /// 1311 /// Note that we have to be very careful to clear allocas out of this list in 1312 /// the event they are deleted. 1313 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist; 1314 1315 /// \brief A collection of alloca instructions we can directly promote. 1316 std::vector<AllocaInst *> PromotableAllocas; 1317 1318public: 1319 SROA(bool RequiresDomTree = true) 1320 : FunctionPass(ID), RequiresDomTree(RequiresDomTree), 1321 C(0), TD(0), DT(0) { 1322 initializeSROAPass(*PassRegistry::getPassRegistry()); 1323 } 1324 bool runOnFunction(Function &F); 1325 void getAnalysisUsage(AnalysisUsage &AU) const; 1326 1327 const char *getPassName() const { return "SROA"; } 1328 static char ID; 1329 1330private: 1331 friend class PHIOrSelectSpeculator; 1332 friend class AllocaPartitionRewriter; 1333 friend class AllocaPartitionVectorRewriter; 1334 1335 bool rewriteAllocaPartition(AllocaInst &AI, 1336 AllocaPartitioning &P, 1337 AllocaPartitioning::iterator PI); 1338 bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P); 1339 bool runOnAlloca(AllocaInst &AI); 1340 void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); 1341 bool promoteAllocas(Function &F); 1342}; 1343} 1344 1345char SROA::ID = 0; 1346 1347FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { 1348 return new SROA(RequiresDomTree); 1349} 1350 1351INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", 1352 false, false) 1353INITIALIZE_PASS_DEPENDENCY(DominatorTree) 1354INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", 1355 false, false) 1356 1357namespace { 1358/// \brief Visitor to speculate PHIs and Selects where possible. 1359class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> { 1360 // Befriend the base class so it can delegate to private visit methods. 1361 friend class llvm::InstVisitor<PHIOrSelectSpeculator>; 1362 1363 const DataLayout &TD; 1364 AllocaPartitioning &P; 1365 SROA &Pass; 1366 1367public: 1368 PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass) 1369 : TD(TD), P(P), Pass(Pass) {} 1370 1371 /// \brief Visit the users of an alloca partition and rewrite them. 
1372 void visitUsers(AllocaPartitioning::const_iterator PI) { 1373 // Note that we need to use an index here as the underlying vector of uses 1374 // may be grown during speculation. However, we never need to re-visit the 1375 // new uses, and so we can use the initial size bound. 1376 for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) { 1377 const AllocaPartitioning::PartitionUse &PU = P.getUse(PI, Idx); 1378 if (!PU.U) 1379 continue; // Skip dead use. 1380 1381 visit(cast<Instruction>(PU.U->getUser())); 1382 } 1383 } 1384 1385private: 1386 // By default, skip this instruction. 1387 void visitInstruction(Instruction &I) {} 1388 1389 /// PHI instructions that use an alloca and are subsequently loaded can be 1390 /// rewritten to load both input pointers in the pred blocks and then PHI the 1391 /// results, allowing the load of the alloca to be promoted. 1392 /// From this: 1393 /// %P2 = phi [i32* %Alloca, i32* %Other] 1394 /// %V = load i32* %P2 1395 /// to: 1396 /// %V1 = load i32* %Alloca -> will be mem2reg'd 1397 /// ... 1398 /// %V2 = load i32* %Other 1399 /// ... 1400 /// %V = phi [i32 %V1, i32 %V2] 1401 /// 1402 /// We can do this to a select if its only uses are loads and if the operands 1403 /// to the select can be loaded unconditionally. 1404 /// 1405 /// FIXME: This should be hoisted into a generic utility, likely in 1406 /// Transforms/Util/Local.h 1407 bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) { 1408 // For now, we can only do this promotion if the load is in the same block 1409 // as the PHI, and if there are no stores between the phi and load. 1410 // TODO: Allow recursive phi users. 1411 // TODO: Allow stores. 1412 BasicBlock *BB = PN.getParent(); 1413 unsigned MaxAlign = 0; 1414 for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); 1415 UI != UE; ++UI) { 1416 LoadInst *LI = dyn_cast<LoadInst>(*UI); 1417 if (LI == 0 || !LI->isSimple()) return false; 1418 1419 // For now we only allow loads in the same block as the PHI. This is 1420 // a common case that happens when instcombine merges two loads through 1421 // a PHI. 1422 if (LI->getParent() != BB) return false; 1423 1424 // Ensure that there are no instructions between the PHI and the load that 1425 // could store. 1426 for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) 1427 if (BBI->mayWriteToMemory()) 1428 return false; 1429 1430 MaxAlign = std::max(MaxAlign, LI->getAlignment()); 1431 Loads.push_back(LI); 1432 } 1433 1434 // We can only transform this if it is safe to push the loads into the 1435 // predecessor blocks. The only thing to watch out for is that we can't put 1436 // a possibly trapping load in the predecessor if it is a critical edge. 1437 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 1438 TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); 1439 Value *InVal = PN.getIncomingValue(Idx); 1440 1441 // If the value is produced by the terminator of the predecessor (an 1442 // invoke) or it has side-effects, there is no valid place to put a load 1443 // in the predecessor. 1444 if (TI == InVal || TI->mayHaveSideEffects()) 1445 return false; 1446 1447 // If the predecessor has a single successor, then the edge isn't 1448 // critical. 1449 if (TI->getNumSuccessors() == 1) 1450 continue; 1451 1452 // If this pointer is always safe to load, or if we can prove that there 1453 // is already a load in the block, then we can move the load to the pred 1454 // block. 
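      // Hypothetical example: an incoming value that is itself an alloca or a
      // definitively defined global is always dereferenceable, so the
      // speculated load is safe in the predecessor even across a critical
      // edge.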
1455 if (InVal->isDereferenceablePointer() || 1456 isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD)) 1457 continue; 1458 1459 return false; 1460 } 1461 1462 return true; 1463 } 1464 1465 void visitPHINode(PHINode &PN) { 1466 DEBUG(dbgs() << " original: " << PN << "\n"); 1467 1468 SmallVector<LoadInst *, 4> Loads; 1469 if (!isSafePHIToSpeculate(PN, Loads)) 1470 return; 1471 1472 assert(!Loads.empty()); 1473 1474 Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); 1475 IRBuilder<> PHIBuilder(&PN); 1476 PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), 1477 PN.getName() + ".sroa.speculated"); 1478 1479 // Get the TBAA tag and alignment to use from one of the loads. It doesn't 1480 // matter which one we get and if any differ. 1481 LoadInst *SomeLoad = cast<LoadInst>(Loads.back()); 1482 MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); 1483 unsigned Align = SomeLoad->getAlignment(); 1484 1485 // Rewrite all loads of the PN to use the new PHI. 1486 do { 1487 LoadInst *LI = Loads.pop_back_val(); 1488 LI->replaceAllUsesWith(NewPN); 1489 Pass.DeadInsts.insert(LI); 1490 } while (!Loads.empty()); 1491 1492 // Inject loads into all of the pred blocks. 1493 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 1494 BasicBlock *Pred = PN.getIncomingBlock(Idx); 1495 TerminatorInst *TI = Pred->getTerminator(); 1496 Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx)); 1497 Value *InVal = PN.getIncomingValue(Idx); 1498 IRBuilder<> PredBuilder(TI); 1499 1500 LoadInst *Load 1501 = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." + 1502 Pred->getName())); 1503 ++NumLoadsSpeculated; 1504 Load->setAlignment(Align); 1505 if (TBAATag) 1506 Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); 1507 NewPN->addIncoming(Load, Pred); 1508 1509 Instruction *Ptr = dyn_cast<Instruction>(InVal); 1510 if (!Ptr) 1511 // No uses to rewrite. 1512 continue; 1513 1514 // Try to lookup and rewrite any partition uses corresponding to this phi 1515 // input. 1516 AllocaPartitioning::iterator PI 1517 = P.findPartitionForPHIOrSelectOperand(InUse); 1518 if (PI == P.end()) 1519 continue; 1520 1521 // Replace the Use in the PartitionUse for this operand with the Use 1522 // inside the load. 1523 AllocaPartitioning::use_iterator UI 1524 = P.findPartitionUseForPHIOrSelectOperand(InUse); 1525 assert(isa<PHINode>(*UI->U->getUser())); 1526 UI->U = &Load->getOperandUse(Load->getPointerOperandIndex()); 1527 } 1528 DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); 1529 } 1530 1531 /// Select instructions that use an alloca and are subsequently loaded can be 1532 /// rewritten to load both input pointers and then select between the result, 1533 /// allowing the load of the alloca to be promoted. 1534 /// From this: 1535 /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other 1536 /// %V = load i32* %P2 1537 /// to: 1538 /// %V1 = load i32* %Alloca -> will be mem2reg'd 1539 /// %V2 = load i32* %Other 1540 /// %V = select i1 %cond, i32 %V1, i32 %V2 1541 /// 1542 /// We can do this to a select if its only uses are loads and if the operand 1543 /// to the select can be loaded unconditionally. 
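  ///
  /// For example (illustrative), speculation must be rejected when one of the
  /// select's operands might not be safe to load unconditionally:
  /// \code
  ///   %P = select i1 %cond, i32* %Alloca, i32* %MaybeNull
  ///   %V = load i32* %P   ; cannot speculate: loading %MaybeNull may trap
  /// \endcode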
1544 bool isSafeSelectToSpeculate(SelectInst &SI, 1545 SmallVectorImpl<LoadInst *> &Loads) { 1546 Value *TValue = SI.getTrueValue(); 1547 Value *FValue = SI.getFalseValue(); 1548 bool TDerefable = TValue->isDereferenceablePointer(); 1549 bool FDerefable = FValue->isDereferenceablePointer(); 1550 1551 for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); 1552 UI != UE; ++UI) { 1553 LoadInst *LI = dyn_cast<LoadInst>(*UI); 1554 if (LI == 0 || !LI->isSimple()) return false; 1555 1556 // Both operands to the select need to be dereferencable, either 1557 // absolutely (e.g. allocas) or at this point because we can see other 1558 // accesses to it. 1559 if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI, 1560 LI->getAlignment(), &TD)) 1561 return false; 1562 if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI, 1563 LI->getAlignment(), &TD)) 1564 return false; 1565 Loads.push_back(LI); 1566 } 1567 1568 return true; 1569 } 1570 1571 void visitSelectInst(SelectInst &SI) { 1572 DEBUG(dbgs() << " original: " << SI << "\n"); 1573 1574 // If the select isn't safe to speculate, just use simple logic to emit it. 1575 SmallVector<LoadInst *, 4> Loads; 1576 if (!isSafeSelectToSpeculate(SI, Loads)) 1577 return; 1578 1579 IRBuilder<> IRB(&SI); 1580 Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) }; 1581 AllocaPartitioning::iterator PIs[2]; 1582 AllocaPartitioning::PartitionUse PUs[2]; 1583 for (unsigned i = 0, e = 2; i != e; ++i) { 1584 PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]); 1585 if (PIs[i] != P.end()) { 1586 // If the pointer is within the partitioning, remove the select from 1587 // its uses. We'll add in the new loads below. 1588 AllocaPartitioning::use_iterator UI 1589 = P.findPartitionUseForPHIOrSelectOperand(Ops[i]); 1590 PUs[i] = *UI; 1591 // Clear out the use here so that the offsets into the use list remain 1592 // stable but this use is ignored when rewriting. 1593 UI->U = 0; 1594 } 1595 } 1596 1597 Value *TV = SI.getTrueValue(); 1598 Value *FV = SI.getFalseValue(); 1599 // Replace the loads of the select with a select of two loads. 1600 while (!Loads.empty()) { 1601 LoadInst *LI = Loads.pop_back_val(); 1602 1603 IRB.SetInsertPoint(LI); 1604 LoadInst *TL = 1605 IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); 1606 LoadInst *FL = 1607 IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); 1608 NumLoadsSpeculated += 2; 1609 1610 // Transfer alignment and TBAA info if present. 1611 TL->setAlignment(LI->getAlignment()); 1612 FL->setAlignment(LI->getAlignment()); 1613 if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { 1614 TL->setMetadata(LLVMContext::MD_tbaa, Tag); 1615 FL->setMetadata(LLVMContext::MD_tbaa, Tag); 1616 } 1617 1618 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, 1619 LI->getName() + ".sroa.speculated"); 1620 1621 LoadInst *Loads[2] = { TL, FL }; 1622 for (unsigned i = 0, e = 2; i != e; ++i) { 1623 if (PIs[i] != P.end()) { 1624 Use *LoadUse = &Loads[i]->getOperandUse(0); 1625 assert(PUs[i].U->get() == LoadUse->get()); 1626 PUs[i].U = LoadUse; 1627 P.use_push_back(PIs[i], PUs[i]); 1628 } 1629 } 1630 1631 DEBUG(dbgs() << " speculated to: " << *V << "\n"); 1632 LI->replaceAllUsesWith(V); 1633 Pass.DeadInsts.insert(LI); 1634 } 1635 } 1636}; 1637} 1638 1639/// \brief Build a GEP out of a base pointer and indices. 1640/// 1641/// This will return the BasePtr if that is valid, or build a new GEP 1642/// instruction using the IRBuilder if GEP-ing is needed. 
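///
/// For example (illustrative):
///   Indices = {}               --> returns BasePtr unchanged
///   Indices = {i32 0}          --> returns BasePtr unchanged (a lone zero
///                                  index is a no-op)
///   Indices = {i32 0, i32 2}   --> emits a "getelementptr inbounds" off of
///                                  BasePtr with those indices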
1643static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr, 1644 SmallVectorImpl<Value *> &Indices, 1645 const Twine &Prefix) { 1646 if (Indices.empty()) 1647 return BasePtr; 1648 1649 // A single zero index is a no-op, so check for this and avoid building a GEP 1650 // in that case. 1651 if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) 1652 return BasePtr; 1653 1654 return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx"); 1655} 1656 1657/// \brief Get a natural GEP off of the BasePtr walking through Ty toward 1658/// TargetTy without changing the offset of the pointer. 1659/// 1660/// This routine assumes we've already established a properly offset GEP with 1661/// Indices, and arrived at the Ty type. The goal is to continue to GEP with 1662/// zero-indices down through type layers until we find one the same as 1663/// TargetTy. If we can't find one with the same type, we at least try to use 1664/// one with the same size. If none of that works, we just produce the GEP as 1665/// indicated by Indices to have the correct offset. 1666static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD, 1667 Value *BasePtr, Type *Ty, Type *TargetTy, 1668 SmallVectorImpl<Value *> &Indices, 1669 const Twine &Prefix) { 1670 if (Ty == TargetTy) 1671 return buildGEP(IRB, BasePtr, Indices, Prefix); 1672 1673 // See if we can descend into a struct and locate a field with the correct 1674 // type. 1675 unsigned NumLayers = 0; 1676 Type *ElementTy = Ty; 1677 do { 1678 if (ElementTy->isPointerTy()) 1679 break; 1680 if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) { 1681 ElementTy = SeqTy->getElementType(); 1682 // Note that we use the default address space as this index is over an 1683 // array or a vector, not a pointer. 1684 Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0))); 1685 } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { 1686 if (STy->element_begin() == STy->element_end()) 1687 break; // Nothing left to descend into. 1688 ElementTy = *STy->element_begin(); 1689 Indices.push_back(IRB.getInt32(0)); 1690 } else { 1691 break; 1692 } 1693 ++NumLayers; 1694 } while (ElementTy != TargetTy); 1695 if (ElementTy != TargetTy) 1696 Indices.erase(Indices.end() - NumLayers, Indices.end()); 1697 1698 return buildGEP(IRB, BasePtr, Indices, Prefix); 1699} 1700 1701/// \brief Recursively compute indices for a natural GEP. 1702/// 1703/// This is the recursive step for getNaturalGEPWithOffset that walks down the 1704/// element types adding appropriate indices for the GEP. 1705static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, 1706 Value *Ptr, Type *Ty, APInt &Offset, 1707 Type *TargetTy, 1708 SmallVectorImpl<Value *> &Indices, 1709 const Twine &Prefix) { 1710 if (Offset == 0) 1711 return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix); 1712 1713 // We can't recurse through pointer types. 1714 if (Ty->isPointerTy()) 1715 return 0; 1716 1717 // We try to analyze GEPs over vectors here, but note that these GEPs are 1718 // extremely poorly defined currently. The long-term goal is to remove GEPing 1719 // over a vector from the IR completely. 1720 if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { 1721 unsigned ElementSizeInBits = TD.getTypeSizeInBits(VecTy->getScalarType()); 1722 if (ElementSizeInBits % 8) 1723 return 0; // GEPs over non-multiple of 8 size vector elements are invalid. 
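    // Worked example (illustrative): indexing 12 bytes into <4 x i32> gives
    // ElementSize = 4, so NumSkippedElements = 3 and the remaining Offset
    // becomes 0; we emit the index 3 and recurse into the i32 element type.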
1724 APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); 1725 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1726 if (NumSkippedElements.ugt(VecTy->getNumElements())) 1727 return 0; 1728 Offset -= NumSkippedElements * ElementSize; 1729 Indices.push_back(IRB.getInt(NumSkippedElements)); 1730 return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), 1731 Offset, TargetTy, Indices, Prefix); 1732 } 1733 1734 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 1735 Type *ElementTy = ArrTy->getElementType(); 1736 APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); 1737 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1738 if (NumSkippedElements.ugt(ArrTy->getNumElements())) 1739 return 0; 1740 1741 Offset -= NumSkippedElements * ElementSize; 1742 Indices.push_back(IRB.getInt(NumSkippedElements)); 1743 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1744 Indices, Prefix); 1745 } 1746 1747 StructType *STy = dyn_cast<StructType>(Ty); 1748 if (!STy) 1749 return 0; 1750 1751 const StructLayout *SL = TD.getStructLayout(STy); 1752 uint64_t StructOffset = Offset.getZExtValue(); 1753 if (StructOffset >= SL->getSizeInBytes()) 1754 return 0; 1755 unsigned Index = SL->getElementContainingOffset(StructOffset); 1756 Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); 1757 Type *ElementTy = STy->getElementType(Index); 1758 if (Offset.uge(TD.getTypeAllocSize(ElementTy))) 1759 return 0; // The offset points into alignment padding. 1760 1761 Indices.push_back(IRB.getInt32(Index)); 1762 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1763 Indices, Prefix); 1764} 1765 1766/// \brief Get a natural GEP from a base pointer to a particular offset and 1767/// resulting in a particular type. 1768/// 1769/// The goal is to produce a "natural" looking GEP that works with the existing 1770/// composite types to arrive at the appropriate offset and element type for 1771/// a pointer. TargetTy is the element type the returned GEP should point-to if 1772/// possible. We recurse by decreasing Offset, adding the appropriate index to 1773/// Indices, and setting Ty to the result subtype. 1774/// 1775/// If no natural GEP can be constructed, this function returns null. 1776static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, 1777 Value *Ptr, APInt Offset, Type *TargetTy, 1778 SmallVectorImpl<Value *> &Indices, 1779 const Twine &Prefix) { 1780 PointerType *Ty = cast<PointerType>(Ptr->getType()); 1781 1782 // Don't consider any GEPs through an i8* as natural unless the TargetTy is 1783 // an i8. 1784 if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8)) 1785 return 0; 1786 1787 Type *ElementTy = Ty->getElementType(); 1788 if (!ElementTy->isSized()) 1789 return 0; // We can't GEP through an unsized element. 1790 APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); 1791 if (ElementSize == 0) 1792 return 0; // Zero-length arrays can't help us build a natural GEP. 1793 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1794 1795 Offset -= NumSkippedElements * ElementSize; 1796 Indices.push_back(IRB.getInt(NumSkippedElements)); 1797 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1798 Indices, Prefix); 1799} 1800 1801/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the 1802/// resulting pointer has PointerTy. 
1803/// 1804/// This tries very hard to compute a "natural" GEP which arrives at the offset 1805/// and produces the pointer type desired. Where it cannot, it will try to use 1806/// the natural GEP to arrive at the offset and bitcast to the type. Where that 1807/// fails, it will try to use an existing i8* and GEP to the byte offset and 1808/// bitcast to the type. 1809/// 1810/// The strategy for finding the more natural GEPs is to peel off layers of the 1811/// pointer, walking back through bit casts and GEPs, searching for a base 1812/// pointer from which we can compute a natural GEP with the desired 1813/// properties. The algorithm tries to fold as many constant indices into 1814/// a single GEP as possible, thus making each GEP more independent of the 1815/// surrounding code. 1816static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD, 1817 Value *Ptr, APInt Offset, Type *PointerTy, 1818 const Twine &Prefix) { 1819 // Even though we don't look through PHI nodes, we could be called on an 1820 // instruction in an unreachable block, which may be on a cycle. 1821 SmallPtrSet<Value *, 4> Visited; 1822 Visited.insert(Ptr); 1823 SmallVector<Value *, 4> Indices; 1824 1825 // We may end up computing an offset pointer that has the wrong type. If we 1826 // never are able to compute one directly that has the correct type, we'll 1827 // fall back to it, so keep it around here. 1828 Value *OffsetPtr = 0; 1829 1830 // Remember any i8 pointer we come across to re-use if we need to do a raw 1831 // byte offset. 1832 Value *Int8Ptr = 0; 1833 APInt Int8PtrOffset(Offset.getBitWidth(), 0); 1834 1835 Type *TargetTy = PointerTy->getPointerElementType(); 1836 1837 do { 1838 // First fold any existing GEPs into the offset. 1839 while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { 1840 APInt GEPOffset(Offset.getBitWidth(), 0); 1841 if (!GEP->accumulateConstantOffset(TD, GEPOffset)) 1842 break; 1843 Offset += GEPOffset; 1844 Ptr = GEP->getPointerOperand(); 1845 if (!Visited.insert(Ptr)) 1846 break; 1847 } 1848 1849 // See if we can perform a natural GEP here. 1850 Indices.clear(); 1851 if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy, 1852 Indices, Prefix)) { 1853 if (P->getType() == PointerTy) { 1854 // Zap any offset pointer that we ended up computing in previous rounds. 1855 if (OffsetPtr && OffsetPtr->use_empty()) 1856 if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) 1857 I->eraseFromParent(); 1858 return P; 1859 } 1860 if (!OffsetPtr) { 1861 OffsetPtr = P; 1862 } 1863 } 1864 1865 // Stash this pointer if we've found an i8*. 1866 if (Ptr->getType()->isIntegerTy(8)) { 1867 Int8Ptr = Ptr; 1868 Int8PtrOffset = Offset; 1869 } 1870 1871 // Peel off a layer of the pointer and update the offset appropriately. 1872 if (Operator::getOpcode(Ptr) == Instruction::BitCast) { 1873 Ptr = cast<Operator>(Ptr)->getOperand(0); 1874 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) { 1875 if (GA->mayBeOverridden()) 1876 break; 1877 Ptr = GA->getAliasee(); 1878 } else { 1879 break; 1880 } 1881 assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!"); 1882 } while (Visited.insert(Ptr)); 1883 1884 if (!OffsetPtr) { 1885 if (!Int8Ptr) { 1886 Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(), 1887 Prefix + ".raw_cast"); 1888 Int8PtrOffset = Offset; 1889 } 1890 1891 OffsetPtr = Int8PtrOffset == 0 ? 
Int8Ptr : 1892 IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), 1893 Prefix + ".raw_idx"); 1894 } 1895 Ptr = OffsetPtr; 1896 1897 // On the off chance we were targeting i8*, guard the bitcast here. 1898 if (Ptr->getType() != PointerTy) 1899 Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast"); 1900 1901 return Ptr; 1902} 1903 1904/// \brief Test whether we can convert a value from the old to the new type. 1905/// 1906/// This predicate should be used to guard calls to convertValue in order to 1907/// ensure that we only try to convert viable values. The strategy is that we 1908/// will peel off single element struct and array wrappings to get to an 1909/// underlying value, and convert that value. 1910static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { 1911 if (OldTy == NewTy) 1912 return true; 1913 if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) 1914 return false; 1915 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) 1916 return false; 1917 1918 if (NewTy->isPointerTy() || OldTy->isPointerTy()) { 1919 if (NewTy->isPointerTy() && OldTy->isPointerTy()) 1920 return true; 1921 if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) 1922 return true; 1923 return false; 1924 } 1925 1926 return true; 1927} 1928 1929/// \brief Generic routine to convert an SSA value to a value of a different 1930/// type. 1931/// 1932/// This will try various different casting techniques, such as bitcasts, 1933/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test 1934/// two types for viability with this routine. 1935static Value *convertValue(const DataLayout &DL, IRBuilder<> &IRB, Value *V, 1936 Type *Ty) { 1937 assert(canConvertValue(DL, V->getType(), Ty) && 1938 "Value not convertable to type"); 1939 if (V->getType() == Ty) 1940 return V; 1941 if (V->getType()->isIntegerTy() && Ty->isPointerTy()) 1942 return IRB.CreateIntToPtr(V, Ty); 1943 if (V->getType()->isPointerTy() && Ty->isIntegerTy()) 1944 return IRB.CreatePtrToInt(V, Ty); 1945 1946 return IRB.CreateBitCast(V, Ty); 1947} 1948 1949/// \brief Test whether the given alloca partition can be promoted to a vector. 1950/// 1951/// This is a quick test to check whether we can rewrite a particular alloca 1952/// partition (and its newly formed alloca) into a vector alloca with only 1953/// whole-vector loads and stores such that it could be promoted to a vector 1954/// SSA value. We only can ensure this for a limited set of operations, and we 1955/// don't want to do the rewrites unless we are confident that the result will 1956/// be promotable, so we have an early test here. 1957static bool isVectorPromotionViable(const DataLayout &TD, 1958 Type *AllocaTy, 1959 AllocaPartitioning &P, 1960 uint64_t PartitionBeginOffset, 1961 uint64_t PartitionEndOffset, 1962 AllocaPartitioning::const_use_iterator I, 1963 AllocaPartitioning::const_use_iterator E) { 1964 VectorType *Ty = dyn_cast<VectorType>(AllocaTy); 1965 if (!Ty) 1966 return false; 1967 1968 uint64_t ElementSize = TD.getTypeSizeInBits(Ty->getScalarType()); 1969 1970 // While the definition of LLVM vectors is bitpacked, we don't support sizes 1971 // that aren't byte sized. 1972 if (ElementSize % 8) 1973 return false; 1974 assert((TD.getTypeSizeInBits(Ty) % 8) == 0 && 1975 "vector size not a multiple of element size?"); 1976 ElementSize /= 8; 1977 1978 for (; I != E; ++I) { 1979 if (!I->U) 1980 continue; // Skip dead use. 
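    // Example of the index math below (illustrative): with a <4 x i32>
    // partition (ElementSize = 4), a use covering bytes [4, 12) relative to
    // the partition maps to the element range [1, 3), i.e. a <2 x i32> slice.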
1981 1982 uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset; 1983 uint64_t BeginIndex = BeginOffset / ElementSize; 1984 if (BeginIndex * ElementSize != BeginOffset || 1985 BeginIndex >= Ty->getNumElements()) 1986 return false; 1987 uint64_t EndOffset = I->EndOffset - PartitionBeginOffset; 1988 uint64_t EndIndex = EndOffset / ElementSize; 1989 if (EndIndex * ElementSize != EndOffset || 1990 EndIndex > Ty->getNumElements()) 1991 return false; 1992 1993 assert(EndIndex > BeginIndex && "Empty vector!"); 1994 uint64_t NumElements = EndIndex - BeginIndex; 1995 Type *PartitionTy 1996 = (NumElements == 1) ? Ty->getElementType() 1997 : VectorType::get(Ty->getElementType(), NumElements); 1998 1999 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) { 2000 if (MI->isVolatile()) 2001 return false; 2002 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) { 2003 const AllocaPartitioning::MemTransferOffsets &MTO 2004 = P.getMemTransferOffsets(*MTI); 2005 if (!MTO.IsSplittable) 2006 return false; 2007 } 2008 } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) { 2009 // Disable vector promotion when there are loads or stores of an FCA. 2010 return false; 2011 } else if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) { 2012 if (LI->isVolatile()) 2013 return false; 2014 if (!canConvertValue(TD, PartitionTy, LI->getType())) 2015 return false; 2016 } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) { 2017 if (SI->isVolatile()) 2018 return false; 2019 if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy)) 2020 return false; 2021 } else { 2022 return false; 2023 } 2024 } 2025 return true; 2026} 2027 2028/// \brief Test whether the given alloca partition's integer operations can be 2029/// widened to promotable ones. 2030/// 2031/// This is a quick test to check whether we can rewrite the integer loads and 2032/// stores to a particular alloca into wider loads and stores and be able to 2033/// promote the resulting alloca. 2034static bool isIntegerWideningViable(const DataLayout &TD, 2035 Type *AllocaTy, 2036 uint64_t AllocBeginOffset, 2037 AllocaPartitioning &P, 2038 AllocaPartitioning::const_use_iterator I, 2039 AllocaPartitioning::const_use_iterator E) { 2040 uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy); 2041 // Don't create integer types larger than the maximum bitwidth. 2042 if (SizeInBits > IntegerType::MAX_INT_BITS) 2043 return false; 2044 2045 // Don't try to handle allocas with bit-padding. 2046 if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy)) 2047 return false; 2048 2049 // We need to ensure that an integer type with the appropriate bitwidth can 2050 // be converted to the alloca type, whatever that is. We don't want to force 2051 // the alloca itself to have an integer type if there is a more suitable one. 2052 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits); 2053 if (!canConvertValue(TD, AllocaTy, IntTy) || 2054 !canConvertValue(TD, IntTy, AllocaTy)) 2055 return false; 2056 2057 uint64_t Size = TD.getTypeStoreSize(AllocaTy); 2058 2059 // Check the uses to ensure the uses are (likely) promotable integer uses. 2060 // Also ensure that the alloca has a covering load or store. We don't want 2061 // to widen the integer operations only to fail to promote due to some other 2062 // unsplittable entry (which we may make splittable later). 2063 bool WholeAllocaOp = false; 2064 for (; I != E; ++I) { 2065 if (!I->U) 2066 continue; // Skip dead use. 
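    // Example (illustrative): when widening an i64-sized alloca (Size = 8),
    // an i32 load of its upper half has RelBegin = 4 and RelEnd = 8; it is an
    // acceptable integer use but does not by itself set WholeAllocaOp.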

    uint64_t RelBegin = I->BeginOffset - AllocBeginOffset;
    uint64_t RelEnd = I->EndOffset - AllocBeginOffset;

    // We can't reasonably handle cases where the load or store extends past
    // the end of the alloca's type and into its padding.
    if (RelEnd > Size)
      return false;

    if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
      if (LI->isVolatile())
        return false;
      if (RelBegin == 0 && RelEnd == Size)
        WholeAllocaOp = true;
      if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
        if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy))
          return false;
        continue;
      }
      // Non-integer loads need to be convertible from the alloca type so that
      // they are promotable.
      if (RelBegin != 0 || RelEnd != Size ||
          !canConvertValue(TD, AllocaTy, LI->getType()))
        return false;
    } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
      Type *ValueTy = SI->getValueOperand()->getType();
      if (SI->isVolatile())
        return false;
      if (RelBegin == 0 && RelEnd == Size)
        WholeAllocaOp = true;
      if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
        if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy))
          return false;
        continue;
      }
      // Non-integer stores need to be convertible to the alloca type so that
      // they are promotable.
      if (RelBegin != 0 || RelEnd != Size ||
          !canConvertValue(TD, ValueTy, AllocaTy))
        return false;
    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
      if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
        return false;
      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
        const AllocaPartitioning::MemTransferOffsets &MTO
          = P.getMemTransferOffsets(*MTI);
        if (!MTO.IsSplittable)
          return false;
      }
    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->U->getUser())) {
      if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
          II->getIntrinsicID() != Intrinsic::lifetime_end)
        return false;
    } else {
      return false;
    }
  }
  return WholeAllocaOp;
}

static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V,
                             IntegerType *Ty, uint64_t Offset,
                             const Twine &Name) {
  DEBUG(dbgs() << " start: " << *V << "\n");
  IntegerType *IntTy = cast<IntegerType>(V->getType());
  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
         "Element extends past full value");
  uint64_t ShAmt = 8*Offset;
  if (DL.isBigEndian())
    ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
  if (ShAmt) {
    V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
    DEBUG(dbgs() << " shifted: " << *V << "\n");
  }
  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
         "Cannot extract to a larger integer!");
  if (Ty != IntTy) {
    V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
    DEBUG(dbgs() << " trunced: " << *V << "\n");
  }
  return V;
}

static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old,
                            Value *V, uint64_t Offset, const Twine &Name) {
  IntegerType *IntTy = cast<IntegerType>(Old->getType());
  IntegerType *Ty = cast<IntegerType>(V->getType());
  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
         "Cannot insert a larger integer!");
  DEBUG(dbgs() << " start: " << *V << "\n");
  if (Ty != IntTy) {
    V = IRB.CreateZExt(V, IntTy, Name +
".ext"); 2159 DEBUG(dbgs() << " extended: " << *V << "\n"); 2160 } 2161 assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && 2162 "Element store outside of alloca store"); 2163 uint64_t ShAmt = 8*Offset; 2164 if (DL.isBigEndian()) 2165 ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); 2166 if (ShAmt) { 2167 V = IRB.CreateShl(V, ShAmt, Name + ".shift"); 2168 DEBUG(dbgs() << " shifted: " << *V << "\n"); 2169 } 2170 2171 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { 2172 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); 2173 Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); 2174 DEBUG(dbgs() << " masked: " << *Old << "\n"); 2175 V = IRB.CreateOr(Old, V, Name + ".insert"); 2176 DEBUG(dbgs() << " inserted: " << *V << "\n"); 2177 } 2178 return V; 2179} 2180 2181static Value *extractVector(IRBuilder<> &IRB, Value *V, 2182 unsigned BeginIndex, unsigned EndIndex, 2183 const Twine &Name) { 2184 VectorType *VecTy = cast<VectorType>(V->getType()); 2185 unsigned NumElements = EndIndex - BeginIndex; 2186 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 2187 2188 if (NumElements == VecTy->getNumElements()) 2189 return V; 2190 2191 if (NumElements == 1) { 2192 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex), 2193 Name + ".extract"); 2194 DEBUG(dbgs() << " extract: " << *V << "\n"); 2195 return V; 2196 } 2197 2198 SmallVector<Constant*, 8> Mask; 2199 Mask.reserve(NumElements); 2200 for (unsigned i = BeginIndex; i != EndIndex; ++i) 2201 Mask.push_back(IRB.getInt32(i)); 2202 V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), 2203 ConstantVector::get(Mask), 2204 Name + ".extract"); 2205 DEBUG(dbgs() << " shuffle: " << *V << "\n"); 2206 return V; 2207} 2208 2209static Value *insertVector(IRBuilder<> &IRB, Value *Old, Value *V, 2210 unsigned BeginIndex, const Twine &Name) { 2211 VectorType *VecTy = cast<VectorType>(Old->getType()); 2212 assert(VecTy && "Can only insert a vector into a vector"); 2213 2214 VectorType *Ty = dyn_cast<VectorType>(V->getType()); 2215 if (!Ty) { 2216 // Single element to insert. 2217 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex), 2218 Name + ".insert"); 2219 DEBUG(dbgs() << " insert: " << *V << "\n"); 2220 return V; 2221 } 2222 2223 assert(Ty->getNumElements() <= VecTy->getNumElements() && 2224 "Too many elements!"); 2225 if (Ty->getNumElements() == VecTy->getNumElements()) { 2226 assert(V->getType() == VecTy && "Vector type mismatch"); 2227 return V; 2228 } 2229 unsigned EndIndex = BeginIndex + Ty->getNumElements(); 2230 2231 // When inserting a smaller vector into the larger to store, we first 2232 // use a shuffle vector to widen it with undef elements, and then 2233 // a second shuffle vector to select between the loaded vector and the 2234 // incoming vector. 
2235 SmallVector<Constant*, 8> Mask; 2236 Mask.reserve(VecTy->getNumElements()); 2237 for (unsigned i = 0; i != VecTy->getNumElements(); ++i) 2238 if (i >= BeginIndex && i < EndIndex) 2239 Mask.push_back(IRB.getInt32(i - BeginIndex)); 2240 else 2241 Mask.push_back(UndefValue::get(IRB.getInt32Ty())); 2242 V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), 2243 ConstantVector::get(Mask), 2244 Name + ".expand"); 2245 DEBUG(dbgs() << " shuffle1: " << *V << "\n"); 2246 2247 Mask.clear(); 2248 for (unsigned i = 0; i != VecTy->getNumElements(); ++i) 2249 if (i >= BeginIndex && i < EndIndex) 2250 Mask.push_back(IRB.getInt32(i)); 2251 else 2252 Mask.push_back(IRB.getInt32(i + VecTy->getNumElements())); 2253 V = IRB.CreateShuffleVector(V, Old, ConstantVector::get(Mask), 2254 Name + "insert"); 2255 DEBUG(dbgs() << " shuffle2: " << *V << "\n"); 2256 return V; 2257} 2258 2259namespace { 2260/// \brief Visitor to rewrite instructions using a partition of an alloca to 2261/// use a new alloca. 2262/// 2263/// Also implements the rewriting to vector-based accesses when the partition 2264/// passes the isVectorPromotionViable predicate. Most of the rewriting logic 2265/// lives here. 2266class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, 2267 bool> { 2268 // Befriend the base class so it can delegate to private visit methods. 2269 friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>; 2270 2271 const DataLayout &TD; 2272 AllocaPartitioning &P; 2273 SROA &Pass; 2274 AllocaInst &OldAI, &NewAI; 2275 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; 2276 Type *NewAllocaTy; 2277 2278 // If we are rewriting an alloca partition which can be written as pure 2279 // vector operations, we stash extra information here. When VecTy is 2280 // non-null, we have some strict guarantees about the rewritten alloca: 2281 // - The new alloca is exactly the size of the vector type here. 2282 // - The accesses all either map to the entire vector or to a single 2283 // element. 2284 // - The set of accessing instructions is only one of those handled above 2285 // in isVectorPromotionViable. Generally these are the same access kinds 2286 // which are promotable via mem2reg. 2287 VectorType *VecTy; 2288 Type *ElementTy; 2289 uint64_t ElementSize; 2290 2291 // This is a convenience and flag variable that will be null unless the new 2292 // alloca's integer operations should be widened to this integer type due to 2293 // passing isIntegerWideningViable above. If it is non-null, the desired 2294 // integer type will be stored here for easy access during rewriting. 2295 IntegerType *IntTy; 2296 2297 // The offset of the partition user currently being rewritten. 2298 uint64_t BeginOffset, EndOffset; 2299 Use *OldUse; 2300 Instruction *OldPtr; 2301 2302 // The name prefix to use when rewriting instructions for this alloca. 2303 std::string NamePrefix; 2304 2305public: 2306 AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P, 2307 AllocaPartitioning::iterator PI, 2308 SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, 2309 uint64_t NewBeginOffset, uint64_t NewEndOffset) 2310 : TD(TD), P(P), Pass(Pass), 2311 OldAI(OldAI), NewAI(NewAI), 2312 NewAllocaBeginOffset(NewBeginOffset), 2313 NewAllocaEndOffset(NewEndOffset), 2314 NewAllocaTy(NewAI.getAllocatedType()), 2315 VecTy(), ElementTy(), ElementSize(), IntTy(), 2316 BeginOffset(), EndOffset() { 2317 } 2318 2319 /// \brief Visit the users of the alloca partition and rewrite them. 
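  ///
  /// A rough driver sketch (hypothetical; the real call site is the partition
  /// splitting logic elsewhere in this pass, and the use iterators are assumed
  /// to span the uses of the partition PI):
  /// \code
  ///   AllocaPartitionRewriter Rewriter(TD, P, PI, Pass, OldAI, NewAI,
  ///                                    NewBeginOffset, NewEndOffset);
  ///   bool Promotable = Rewriter.visitUsers(UseBegin, UseEnd);
  /// \endcode
  /// The return value reports whether every use could be rewritten in a form
  /// that keeps the new alloca promotable.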
2320 bool visitUsers(AllocaPartitioning::const_use_iterator I, 2321 AllocaPartitioning::const_use_iterator E) { 2322 if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P, 2323 NewAllocaBeginOffset, NewAllocaEndOffset, 2324 I, E)) { 2325 ++NumVectorized; 2326 VecTy = cast<VectorType>(NewAI.getAllocatedType()); 2327 ElementTy = VecTy->getElementType(); 2328 assert((TD.getTypeSizeInBits(VecTy->getScalarType()) % 8) == 0 && 2329 "Only multiple-of-8 sized vector elements are viable"); 2330 ElementSize = TD.getTypeSizeInBits(VecTy->getScalarType()) / 8; 2331 } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(), 2332 NewAllocaBeginOffset, P, I, E)) { 2333 IntTy = Type::getIntNTy(NewAI.getContext(), 2334 TD.getTypeSizeInBits(NewAI.getAllocatedType())); 2335 } 2336 bool CanSROA = true; 2337 for (; I != E; ++I) { 2338 if (!I->U) 2339 continue; // Skip dead uses. 2340 BeginOffset = I->BeginOffset; 2341 EndOffset = I->EndOffset; 2342 OldUse = I->U; 2343 OldPtr = cast<Instruction>(I->U->get()); 2344 NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str(); 2345 CanSROA &= visit(cast<Instruction>(I->U->getUser())); 2346 } 2347 if (VecTy) { 2348 assert(CanSROA); 2349 VecTy = 0; 2350 ElementTy = 0; 2351 ElementSize = 0; 2352 } 2353 if (IntTy) { 2354 assert(CanSROA); 2355 IntTy = 0; 2356 } 2357 return CanSROA; 2358 } 2359 2360private: 2361 // Every instruction which can end up as a user must have a rewrite rule. 2362 bool visitInstruction(Instruction &I) { 2363 DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); 2364 llvm_unreachable("No rewrite rule for this instruction!"); 2365 } 2366 2367 Twine getName(const Twine &Suffix) { 2368 return NamePrefix + Suffix; 2369 } 2370 2371 Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) { 2372 assert(BeginOffset >= NewAllocaBeginOffset); 2373 APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset); 2374 return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName("")); 2375 } 2376 2377 /// \brief Compute suitable alignment to access an offset into the new alloca. 2378 unsigned getOffsetAlign(uint64_t Offset) { 2379 unsigned NewAIAlign = NewAI.getAlignment(); 2380 if (!NewAIAlign) 2381 NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType()); 2382 return MinAlign(NewAIAlign, Offset); 2383 } 2384 2385 /// \brief Compute suitable alignment to access this partition of the new 2386 /// alloca. 2387 unsigned getPartitionAlign() { 2388 return getOffsetAlign(BeginOffset - NewAllocaBeginOffset); 2389 } 2390 2391 /// \brief Compute suitable alignment to access a type at an offset of the 2392 /// new alloca. 2393 /// 2394 /// \returns zero if the type's ABI alignment is a suitable alignment, 2395 /// otherwise returns the maximal suitable alignment. 2396 unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { 2397 unsigned Align = getOffsetAlign(Offset); 2398 return Align == TD.getABITypeAlignment(Ty) ? 0 : Align; 2399 } 2400 2401 /// \brief Compute suitable alignment to access a type at the beginning of 2402 /// this partition of the new alloca. 2403 /// 2404 /// See \c getOffsetTypeAlign for details; this routine delegates to it. 
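  ///
  /// For example (illustrative): with a 16-byte aligned new alloca, a type
  /// accessed 4 bytes into this partition gets MinAlign(16, 4) == 4; if that
  /// happens to equal the type's ABI alignment, zero is returned so the
  /// default alignment is used instead.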
  unsigned getPartitionTypeAlign(Type *Ty) {
    return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset);
  }

  unsigned getIndex(uint64_t Offset) {
    assert(VecTy && "Can only call getIndex when rewriting a vector");
    uint64_t RelOffset = Offset - NewAllocaBeginOffset;
    assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
    uint32_t Index = RelOffset / ElementSize;
    assert(Index * ElementSize == RelOffset);
    return Index;
  }

  void deleteIfTriviallyDead(Value *V) {
    Instruction *I = cast<Instruction>(V);
    if (isInstructionTriviallyDead(I))
      Pass.DeadInsts.insert(I);
  }

  Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB) {
    unsigned BeginIndex = getIndex(BeginOffset);
    unsigned EndIndex = getIndex(EndOffset);
    assert(EndIndex > BeginIndex && "Empty vector!");

    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                     getName(".load"));
    return extractVector(IRB, V, BeginIndex, EndIndex, getName(".vec"));
  }

  Value *rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) {
    assert(IntTy && "We cannot insert an integer to the alloca");
    assert(!LI.isVolatile());
    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                     getName(".load"));
    V = convertValue(TD, IRB, V, IntTy);
    assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
    uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
    if (Offset > 0 || EndOffset < NewAllocaEndOffset)
      V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset,
                         getName(".extract"));
    return V;
  }

  bool visitLoadInst(LoadInst &LI) {
    DEBUG(dbgs() << " original: " << LI << "\n");
    Value *OldOp = LI.getOperand(0);
    assert(OldOp == OldPtr);

    uint64_t Size = EndOffset - BeginOffset;
    bool IsSplitIntLoad = Size < TD.getTypeStoreSize(LI.getType());

    // If this memory access can be shown to *statically* extend outside the
    // bounds of the original allocation, its behavior is undefined. Rather
    // than trying to transform it, just replace it with undef.
    // FIXME: We should do something more clever for functions being
    // instrumented by asan.
    // FIXME: Eventually, once ASan and friends can flush out bugs here, this
    // should be transformed to a load of null making it unreachable.
    uint64_t OldAllocSize = TD.getTypeAllocSize(OldAI.getAllocatedType());
    if (TD.getTypeStoreSize(LI.getType()) > OldAllocSize) {
      LI.replaceAllUsesWith(UndefValue::get(LI.getType()));
      Pass.DeadInsts.insert(&LI);
      deleteIfTriviallyDead(OldOp);
      DEBUG(dbgs() << " to: undef!!\n");
      return true;
    }

    IRBuilder<> IRB(&LI);
    Type *TargetTy = IsSplitIntLoad ?
Type::getIntNTy(LI.getContext(), Size * 8) 2474 : LI.getType(); 2475 bool IsPtrAdjusted = false; 2476 Value *V; 2477 if (VecTy) { 2478 V = rewriteVectorizedLoadInst(IRB); 2479 } else if (IntTy && LI.getType()->isIntegerTy()) { 2480 V = rewriteIntegerLoad(IRB, LI); 2481 } else if (BeginOffset == NewAllocaBeginOffset && 2482 canConvertValue(TD, NewAllocaTy, LI.getType())) { 2483 V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2484 LI.isVolatile(), getName(".load")); 2485 } else { 2486 Type *LTy = TargetTy->getPointerTo(); 2487 V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy), 2488 getPartitionTypeAlign(TargetTy), 2489 LI.isVolatile(), getName(".load")); 2490 IsPtrAdjusted = true; 2491 } 2492 V = convertValue(TD, IRB, V, TargetTy); 2493 2494 if (IsSplitIntLoad) { 2495 assert(!LI.isVolatile()); 2496 assert(LI.getType()->isIntegerTy() && 2497 "Only integer type loads and stores are split"); 2498 assert(LI.getType()->getIntegerBitWidth() == 2499 TD.getTypeStoreSizeInBits(LI.getType()) && 2500 "Non-byte-multiple bit width"); 2501 assert(LI.getType()->getIntegerBitWidth() == 2502 TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) && 2503 "Only alloca-wide loads can be split and recomposed"); 2504 // Move the insertion point just past the load so that we can refer to it. 2505 IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI))); 2506 // Create a placeholder value with the same type as LI to use as the 2507 // basis for the new value. This allows us to replace the uses of LI with 2508 // the computed value, and then replace the placeholder with LI, leaving 2509 // LI only used for this computation. 2510 Value *Placeholder 2511 = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); 2512 V = insertInteger(TD, IRB, Placeholder, V, BeginOffset, 2513 getName(".insert")); 2514 LI.replaceAllUsesWith(V); 2515 Placeholder->replaceAllUsesWith(&LI); 2516 delete Placeholder; 2517 } else { 2518 LI.replaceAllUsesWith(V); 2519 } 2520 2521 Pass.DeadInsts.insert(&LI); 2522 deleteIfTriviallyDead(OldOp); 2523 DEBUG(dbgs() << " to: " << *V << "\n"); 2524 return !LI.isVolatile() && !IsPtrAdjusted; 2525 } 2526 2527 bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, Value *V, 2528 StoreInst &SI, Value *OldOp) { 2529 unsigned BeginIndex = getIndex(BeginOffset); 2530 unsigned EndIndex = getIndex(EndOffset); 2531 assert(EndIndex > BeginIndex && "Empty vector!"); 2532 unsigned NumElements = EndIndex - BeginIndex; 2533 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 2534 Type *PartitionTy 2535 = (NumElements == 1) ? ElementTy 2536 : VectorType::get(ElementTy, NumElements); 2537 if (V->getType() != PartitionTy) 2538 V = convertValue(TD, IRB, V, PartitionTy); 2539 2540 // Mix in the existing elements. 
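    // For example (illustrative): storing a <2 x i32> [a, b] into elements
    // [1, 3) of a <4 x i32> alloca loads the old value [o0, o1, o2, o3] and
    // lets insertVector produce [o0, a, b, o3] before the store below.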
2541 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2542 getName(".load")); 2543 V = insertVector(IRB, Old, V, BeginIndex, getName(".vec")); 2544 2545 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 2546 Pass.DeadInsts.insert(&SI); 2547 2548 (void)Store; 2549 DEBUG(dbgs() << " to: " << *Store << "\n"); 2550 return true; 2551 } 2552 2553 bool rewriteIntegerStore(IRBuilder<> &IRB, Value *V, StoreInst &SI) { 2554 assert(IntTy && "We cannot extract an integer from the alloca"); 2555 assert(!SI.isVolatile()); 2556 if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { 2557 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2558 getName(".oldload")); 2559 Old = convertValue(TD, IRB, Old, IntTy); 2560 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2561 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2562 V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset, 2563 getName(".insert")); 2564 } 2565 V = convertValue(TD, IRB, V, NewAllocaTy); 2566 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 2567 Pass.DeadInsts.insert(&SI); 2568 (void)Store; 2569 DEBUG(dbgs() << " to: " << *Store << "\n"); 2570 return true; 2571 } 2572 2573 bool visitStoreInst(StoreInst &SI) { 2574 DEBUG(dbgs() << " original: " << SI << "\n"); 2575 Value *OldOp = SI.getOperand(1); 2576 assert(OldOp == OldPtr); 2577 IRBuilder<> IRB(&SI); 2578 2579 Value *V = SI.getValueOperand(); 2580 2581 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2582 // alloca that should be re-examined after promoting this alloca. 2583 if (V->getType()->isPointerTy()) 2584 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) 2585 Pass.PostPromotionWorklist.insert(AI); 2586 2587 uint64_t Size = EndOffset - BeginOffset; 2588 if (Size < TD.getTypeStoreSize(V->getType())) { 2589 assert(!SI.isVolatile()); 2590 assert(V->getType()->isIntegerTy() && 2591 "Only integer type loads and stores are split"); 2592 assert(V->getType()->getIntegerBitWidth() == 2593 TD.getTypeStoreSizeInBits(V->getType()) && 2594 "Non-byte-multiple bit width"); 2595 assert(V->getType()->getIntegerBitWidth() == 2596 TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) && 2597 "Only alloca-wide stores can be split and recomposed"); 2598 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8); 2599 V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset, 2600 getName(".extract")); 2601 } 2602 2603 if (VecTy) 2604 return rewriteVectorizedStoreInst(IRB, V, SI, OldOp); 2605 if (IntTy && V->getType()->isIntegerTy()) 2606 return rewriteIntegerStore(IRB, V, SI); 2607 2608 StoreInst *NewSI; 2609 if (BeginOffset == NewAllocaBeginOffset && 2610 canConvertValue(TD, V->getType(), NewAllocaTy)) { 2611 V = convertValue(TD, IRB, V, NewAllocaTy); 2612 NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), 2613 SI.isVolatile()); 2614 } else { 2615 Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo()); 2616 NewSI = IRB.CreateAlignedStore(V, NewPtr, 2617 getPartitionTypeAlign(V->getType()), 2618 SI.isVolatile()); 2619 } 2620 (void)NewSI; 2621 Pass.DeadInsts.insert(&SI); 2622 deleteIfTriviallyDead(OldOp); 2623 2624 DEBUG(dbgs() << " to: " << *NewSI << "\n"); 2625 return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile(); 2626 } 2627 2628 /// \brief Compute an integer value from splatting an i8 across the given 2629 /// number of bytes. 
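  ///
  /// For example (illustrative): splatting the byte 0xAB across 4 bytes
  /// multiplies zext(0xAB) by 0x01010101 (the all-ones i32 udiv the
  /// zero-extended all-ones i8), yielding 0xABABABAB.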
2630 /// 2631 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't 2632 /// call this routine. 2633 /// FIXME: Heed the advice above. 2634 /// 2635 /// \param V The i8 value to splat. 2636 /// \param Size The number of bytes in the output (assuming i8 is one byte) 2637 Value *getIntegerSplat(IRBuilder<> &IRB, Value *V, unsigned Size) { 2638 assert(Size > 0 && "Expected a positive number of bytes."); 2639 IntegerType *VTy = cast<IntegerType>(V->getType()); 2640 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte"); 2641 if (Size == 1) 2642 return V; 2643 2644 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); 2645 V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")), 2646 ConstantExpr::getUDiv( 2647 Constant::getAllOnesValue(SplatIntTy), 2648 ConstantExpr::getZExt( 2649 Constant::getAllOnesValue(V->getType()), 2650 SplatIntTy)), 2651 getName(".isplat")); 2652 return V; 2653 } 2654 2655 /// \brief Compute a vector splat for a given element value. 2656 Value *getVectorSplat(IRBuilder<> &IRB, Value *V, unsigned NumElements) { 2657 V = IRB.CreateVectorSplat(NumElements, V, NamePrefix); 2658 DEBUG(dbgs() << " splat: " << *V << "\n"); 2659 return V; 2660 } 2661 2662 bool visitMemSetInst(MemSetInst &II) { 2663 DEBUG(dbgs() << " original: " << II << "\n"); 2664 IRBuilder<> IRB(&II); 2665 assert(II.getRawDest() == OldPtr); 2666 2667 // If the memset has a variable size, it cannot be split, just adjust the 2668 // pointer to the new alloca. 2669 if (!isa<Constant>(II.getLength())) { 2670 II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); 2671 Type *CstTy = II.getAlignmentCst()->getType(); 2672 II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign())); 2673 2674 deleteIfTriviallyDead(OldPtr); 2675 return false; 2676 } 2677 2678 // Record this instruction for deletion. 2679 Pass.DeadInsts.insert(&II); 2680 2681 Type *AllocaTy = NewAI.getAllocatedType(); 2682 Type *ScalarTy = AllocaTy->getScalarType(); 2683 2684 // If this doesn't map cleanly onto the alloca type, and that type isn't 2685 // a single value type, just emit a memset. 2686 if (!VecTy && !IntTy && 2687 (BeginOffset != NewAllocaBeginOffset || 2688 EndOffset != NewAllocaEndOffset || 2689 !AllocaTy->isSingleValueType() || 2690 !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)) || 2691 TD.getTypeSizeInBits(ScalarTy)%8 != 0)) { 2692 Type *SizeTy = II.getLength()->getType(); 2693 Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); 2694 CallInst *New 2695 = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB, 2696 II.getRawDest()->getType()), 2697 II.getValue(), Size, getPartitionAlign(), 2698 II.isVolatile()); 2699 (void)New; 2700 DEBUG(dbgs() << " to: " << *New << "\n"); 2701 return false; 2702 } 2703 2704 // If we can represent this as a simple value, we have to build the actual 2705 // value to store, which requires expanding the byte present in memset to 2706 // a sensible representation for the alloca type. This is essentially 2707 // splatting the byte to a sufficiently wide integer, splatting it across 2708 // any desired vector width, and bitcasting to the final type. 2709 Value *V; 2710 2711 if (VecTy) { 2712 // If this is a memset of a vectorized alloca, insert it. 
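      // For example (illustrative): a memset of bytes [4, 12) of a <4 x i32>
      // alloca becomes an i32 splat of the byte value, widened to a <2 x i32>
      // splat and inserted into elements [1, 3) of the old vector value.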
2713 assert(ElementTy == ScalarTy); 2714 2715 unsigned BeginIndex = getIndex(BeginOffset); 2716 unsigned EndIndex = getIndex(EndOffset); 2717 assert(EndIndex > BeginIndex && "Empty vector!"); 2718 unsigned NumElements = EndIndex - BeginIndex; 2719 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 2720 2721 Value *Splat = getIntegerSplat(IRB, II.getValue(), 2722 TD.getTypeSizeInBits(ElementTy)/8); 2723 Splat = convertValue(TD, IRB, Splat, ElementTy); 2724 if (NumElements > 1) 2725 Splat = getVectorSplat(IRB, Splat, NumElements); 2726 2727 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2728 getName(".oldload")); 2729 V = insertVector(IRB, Old, Splat, BeginIndex, getName(".vec")); 2730 } else if (IntTy) { 2731 // If this is a memset on an alloca where we can widen stores, insert the 2732 // set integer. 2733 assert(!II.isVolatile()); 2734 2735 uint64_t Size = EndOffset - BeginOffset; 2736 V = getIntegerSplat(IRB, II.getValue(), Size); 2737 2738 if (IntTy && (BeginOffset != NewAllocaBeginOffset || 2739 EndOffset != NewAllocaBeginOffset)) { 2740 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2741 getName(".oldload")); 2742 Old = convertValue(TD, IRB, Old, IntTy); 2743 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2744 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2745 V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert")); 2746 } else { 2747 assert(V->getType() == IntTy && 2748 "Wrong type for an alloca wide integer!"); 2749 } 2750 V = convertValue(TD, IRB, V, AllocaTy); 2751 } else { 2752 // Established these invariants above. 2753 assert(BeginOffset == NewAllocaBeginOffset); 2754 assert(EndOffset == NewAllocaEndOffset); 2755 2756 V = getIntegerSplat(IRB, II.getValue(), 2757 TD.getTypeSizeInBits(ScalarTy)/8); 2758 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy)) 2759 V = getVectorSplat(IRB, V, AllocaVecTy->getNumElements()); 2760 2761 V = convertValue(TD, IRB, V, AllocaTy); 2762 } 2763 2764 Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), 2765 II.isVolatile()); 2766 (void)New; 2767 DEBUG(dbgs() << " to: " << *New << "\n"); 2768 return !II.isVolatile(); 2769 } 2770 2771 bool visitMemTransferInst(MemTransferInst &II) { 2772 // Rewriting of memory transfer instructions can be a bit tricky. We break 2773 // them into two categories: split intrinsics and unsplit intrinsics. 2774 2775 DEBUG(dbgs() << " original: " << II << "\n"); 2776 IRBuilder<> IRB(&II); 2777 2778 assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr); 2779 bool IsDest = II.getRawDest() == OldPtr; 2780 2781 const AllocaPartitioning::MemTransferOffsets &MTO 2782 = P.getMemTransferOffsets(II); 2783 2784 // Compute the relative offset within the transfer. 2785 unsigned IntPtrWidth = TD.getPointerSizeInBits(); 2786 APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin 2787 : MTO.SourceBegin)); 2788 2789 unsigned Align = II.getAlignment(); 2790 if (Align > 1) 2791 Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), 2792 MinAlign(II.getAlignment(), getPartitionAlign())); 2793 2794 // For unsplit intrinsics, we simply modify the source and destination 2795 // pointers in place. This isn't just an optimization, it is a matter of 2796 // correctness. With unsplit intrinsics we may be dealing with transfers 2797 // within a single alloca before SROA ran, or with transfers that have 2798 // a variable length. 
    // We may also be dealing with memmove instead of
    // memcpy, and so simply updating the pointers is necessary for us to
    // update both source and dest of a single call.
    if (!MTO.IsSplittable) {
      Value *OldOp = IsDest ? II.getRawDest() : II.getRawSource();
      if (IsDest)
        II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
      else
        II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType()));

      Type *CstTy = II.getAlignmentCst()->getType();
      II.setAlignment(ConstantInt::get(CstTy, Align));

      DEBUG(dbgs() << " to: " << II << "\n");
      deleteIfTriviallyDead(OldOp);
      return false;
    }
    // For split transfer intrinsics we have an incredibly useful assurance:
    // the source and destination do not reside within the same alloca, and at
    // least one of them does not escape. This means that we can replace
    // memmove with memcpy, and we don't need to worry about all manner of
    // downsides to splitting and transforming the operations.

    // If this doesn't map cleanly onto the alloca type, and that type isn't
    // a single value type, just emit a memcpy.
    bool EmitMemCpy
      = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset ||
                             EndOffset != NewAllocaEndOffset ||
                             !NewAI.getAllocatedType()->isSingleValueType());

    // If we're just going to emit a memcpy, the alloca hasn't changed, and the
    // size hasn't been shrunk based on analysis of the viable range, this is
    // a no-op.
    if (EmitMemCpy && &OldAI == &NewAI) {
      uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin;
      uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd;
      // Ensure the start lines up.
      assert(BeginOffset == OrigBegin);
      (void)OrigBegin;

      // Rewrite the size as needed.
      if (EndOffset != OrigEnd)
        II.setLength(ConstantInt::get(II.getLength()->getType(),
                                      EndOffset - BeginOffset));
      return false;
    }
    // Record this instruction for deletion.
    Pass.DeadInsts.insert(&II);

    // Strip all inbounds GEPs and pointer casts to try to dig out any root
    // alloca that should be re-examined after rewriting this instruction.
    Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
    if (AllocaInst *AI
          = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets()))
      Pass.Worklist.insert(AI);

    if (EmitMemCpy) {
      Type *OtherPtrTy = IsDest ? II.getRawSource()->getType()
                                : II.getRawDest()->getType();

      // Compute the other pointer, folding as much as possible to produce
      // a single, simple GEP in most cases.
      OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy,
                                getName("." + OtherPtr->getName()));

      Value *OurPtr
        = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType()
                                           : II.getRawSource()->getType());
      Type *SizeTy = II.getLength()->getType();
      Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);

      CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr,
                                       IsDest ? OtherPtr : OurPtr,
                                       Size, Align, II.isVolatile());
      (void)New;
      DEBUG(dbgs() << " to: " << *New << "\n");
      return false;
    }

    // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy
    // is equivalent to 1, but that isn't true if we end up rewriting this as
    // a load or store.
2880 if (!Align) 2881 Align = 1; 2882 2883 bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && 2884 EndOffset == NewAllocaEndOffset; 2885 uint64_t Size = EndOffset - BeginOffset; 2886 unsigned BeginIndex = VecTy ? getIndex(BeginOffset) : 0; 2887 unsigned EndIndex = VecTy ? getIndex(EndOffset) : 0; 2888 unsigned NumElements = EndIndex - BeginIndex; 2889 IntegerType *SubIntTy 2890 = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; 2891 2892 Type *OtherPtrTy = NewAI.getType(); 2893 if (VecTy && !IsWholeAlloca) { 2894 if (NumElements == 1) 2895 OtherPtrTy = VecTy->getElementType(); 2896 else 2897 OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements); 2898 2899 OtherPtrTy = OtherPtrTy->getPointerTo(); 2900 } else if (IntTy && !IsWholeAlloca) { 2901 OtherPtrTy = SubIntTy->getPointerTo(); 2902 } 2903 2904 Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, 2905 getName("." + OtherPtr->getName())); 2906 Value *DstPtr = &NewAI; 2907 if (!IsDest) 2908 std::swap(SrcPtr, DstPtr); 2909 2910 Value *Src; 2911 if (VecTy && !IsWholeAlloca && !IsDest) { 2912 Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2913 getName(".load")); 2914 Src = extractVector(IRB, Src, BeginIndex, EndIndex, getName(".vec")); 2915 } else if (IntTy && !IsWholeAlloca && !IsDest) { 2916 Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2917 getName(".load")); 2918 Src = convertValue(TD, IRB, Src, IntTy); 2919 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2920 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2921 Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract")); 2922 } else { 2923 Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), 2924 getName(".copyload")); 2925 } 2926 2927 if (VecTy && !IsWholeAlloca && IsDest) { 2928 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2929 getName(".oldload")); 2930 Src = insertVector(IRB, Old, Src, BeginIndex, getName(".vec")); 2931 } else if (IntTy && !IsWholeAlloca && IsDest) { 2932 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2933 getName(".oldload")); 2934 Old = convertValue(TD, IRB, Old, IntTy); 2935 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2936 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2937 Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert")); 2938 Src = convertValue(TD, IRB, Src, NewAllocaTy); 2939 } 2940 2941 StoreInst *Store = cast<StoreInst>( 2942 IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile())); 2943 (void)Store; 2944 DEBUG(dbgs() << " to: " << *Store << "\n"); 2945 return !II.isVolatile(); 2946 } 2947 2948 bool visitIntrinsicInst(IntrinsicInst &II) { 2949 assert(II.getIntrinsicID() == Intrinsic::lifetime_start || 2950 II.getIntrinsicID() == Intrinsic::lifetime_end); 2951 DEBUG(dbgs() << " original: " << II << "\n"); 2952 IRBuilder<> IRB(&II); 2953 assert(II.getArgOperand(1) == OldPtr); 2954 2955 // Record this instruction for deletion. 
    Pass.DeadInsts.insert(&II);

    ConstantInt *Size
      = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
                         EndOffset - BeginOffset);
    Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType());
    Value *New;
    if (II.getIntrinsicID() == Intrinsic::lifetime_start)
      New = IRB.CreateLifetimeStart(Ptr, Size);
    else
      New = IRB.CreateLifetimeEnd(Ptr, Size);

    (void)New;
    DEBUG(dbgs() << " to: " << *New << "\n");
    return true;
  }

  bool visitPHINode(PHINode &PN) {
    DEBUG(dbgs() << " original: " << PN << "\n");

    // We would like to compute a new pointer in only one place, but have it be
    // as local as possible to the PHI. To do that, we re-use the location of
    // the old pointer, which necessarily must be in the right position to
    // dominate the PHI.
    IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr));

    Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType());
    // Replace the operands which were using the old pointer.
    std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);

    DEBUG(dbgs() << " to: " << PN << "\n");
    deleteIfTriviallyDead(OldPtr);
    return false;
  }

  bool visitSelectInst(SelectInst &SI) {
    DEBUG(dbgs() << " original: " << SI << "\n");
    IRBuilder<> IRB(&SI);

    // Find the operand we need to rewrite here.
    bool IsTrueVal = SI.getTrueValue() == OldPtr;
    if (IsTrueVal)
      assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!");
    else
      assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!");

    Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType());
    SI.setOperand(IsTrueVal ? 1 : 2, NewPtr);
    DEBUG(dbgs() << " to: " << SI << "\n");
    deleteIfTriviallyDead(OldPtr);
    return false;
  }

};
}

namespace {
/// \brief Visitor to rewrite aggregate loads and stores as scalar.
///
/// This visitor aggressively rewrites all aggregate loads and stores on
/// a particular pointer (or any pointer derived from it which we can identify)
/// with scalar loads and stores.
class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
  // Befriend the base class so it can delegate to private visit methods.
  friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;

  const DataLayout &TD;

  /// Queue of pointer uses to analyze and potentially rewrite.
  SmallVector<Use *, 8> Queue;

  /// Set to prevent us from cycling with phi nodes and loops.
  SmallPtrSet<User *, 8> Visited;

  /// The current pointer use being rewritten. This is used to dig up the used
  /// value (as opposed to the user).
  Use *U;

public:
  AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {}

  /// Rewrite loads and stores through a pointer and all pointers derived from
  /// it.
  bool rewrite(Instruction &I) {
    DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
    enqueueUsers(I);
    bool Changed = false;
    while (!Queue.empty()) {
      U = Queue.pop_back_val();
      Changed |= visit(cast<Instruction>(U->getUser()));
    }
    return Changed;
  }

private:
  /// Enqueue all the users of the given instruction for further processing.
  /// This uses a set to de-duplicate users.
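  /// Because each user is only ever enqueued once, use graphs which cycle
  /// through PHIs and selects still terminate.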
  void enqueueUsers(Instruction &I) {
    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
         ++UI)
      if (Visited.insert(*UI))
        Queue.push_back(&UI.getUse());
  }

  // Conservative default is to not rewrite anything.
  bool visitInstruction(Instruction &I) { return false; }

  /// \brief Generic recursive split emission class.
  template <typename Derived>
  class OpSplitter {
  protected:
    /// The builder used to form new instructions.
    IRBuilder<> IRB;
    /// The indices to be used with insertvalue or extractvalue to select the
    /// appropriate value within the aggregate.
    SmallVector<unsigned, 4> Indices;
    /// The indices to a GEP instruction which will move Ptr to the correct slot
    /// within the aggregate.
    SmallVector<Value *, 4> GEPIndices;
    /// The base pointer of the original op, used as a base for GEPing the
    /// split operations.
    Value *Ptr;

    /// Initialize the splitter with an insertion point and Ptr, starting with
    /// a single zero GEP index.
    OpSplitter(Instruction *InsertionPoint, Value *Ptr)
      : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}

  public:
    /// \brief Generic recursive split emission routine.
    ///
    /// This method recursively splits an aggregate op (load or store) into
    /// scalar or vector ops. It splits recursively until it hits a single value
    /// and emits that single value operation via the template argument.
    ///
    /// The logic of this routine relies on GEPs and insertvalue and
    /// extractvalue all operating with the same fundamental index list, merely
    /// formatted differently (GEPs need actual values).
    ///
    /// \param Ty The type being split recursively into smaller ops.
    /// \param Agg The aggregate value being built up or stored, depending on
    /// whether this is splitting a load or a store respectively.
    void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
      if (Ty->isSingleValueType())
        return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);

      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
        unsigned OldSize = Indices.size();
        (void)OldSize;
        for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
             ++Idx) {
          assert(Indices.size() == OldSize && "Did not return to the old size");
          Indices.push_back(Idx);
          GEPIndices.push_back(IRB.getInt32(Idx));
          emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
          GEPIndices.pop_back();
          Indices.pop_back();
        }
        return;
      }

      if (StructType *STy = dyn_cast<StructType>(Ty)) {
        unsigned OldSize = Indices.size();
        (void)OldSize;
        for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
             ++Idx) {
          assert(Indices.size() == OldSize && "Did not return to the old size");
          Indices.push_back(Idx);
          GEPIndices.push_back(IRB.getInt32(Idx));
          emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
          GEPIndices.pop_back();
          Indices.pop_back();
        }
        return;
      }

      llvm_unreachable("Only arrays and structs are aggregate loadable types");
    }
  };

  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
      : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}

    /// Emit a leaf load of a single value.
    /// This is called at the leaves of the
    /// recursive emission to actually load values.
    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
      assert(Ty->isSingleValueType());
      // Load the single value and insert it using the indices.
      Value *GEP = IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep");
      Value *Load = IRB.CreateLoad(GEP, Name + ".load");
      Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
      DEBUG(dbgs() << " to: " << *Load << "\n");
    }
  };

  bool visitLoadInst(LoadInst &LI) {
    assert(LI.getPointerOperand() == *U);
    if (!LI.isSimple() || LI.getType()->isSingleValueType())
      return false;

    // We have an aggregate being loaded, split it apart.
    DEBUG(dbgs() << " original: " << LI << "\n");
    LoadOpSplitter Splitter(&LI, *U);
    Value *V = UndefValue::get(LI.getType());
    Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
    LI.replaceAllUsesWith(V);
    LI.eraseFromParent();
    return true;
  }

  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
    StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
      : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}

    /// Emit a leaf store of a single value. This is called at the leaves of the
    /// recursive emission to actually produce stores.
    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
      assert(Ty->isSingleValueType());
      // Extract the single value and store it using the indices.
      Value *Store = IRB.CreateStore(
        IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
        IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
      (void)Store;
      DEBUG(dbgs() << " to: " << *Store << "\n");
    }
  };

  bool visitStoreInst(StoreInst &SI) {
    if (!SI.isSimple() || SI.getPointerOperand() != *U)
      return false;
    Value *V = SI.getValueOperand();
    if (V->getType()->isSingleValueType())
      return false;

    // We have an aggregate being stored, split it apart.
    DEBUG(dbgs() << " original: " << SI << "\n");
    StoreOpSplitter Splitter(&SI, *U);
    Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
    SI.eraseFromParent();
    return true;
  }

  bool visitBitCastInst(BitCastInst &BC) {
    enqueueUsers(BC);
    return false;
  }

  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    enqueueUsers(GEPI);
    return false;
  }

  bool visitPHINode(PHINode &PN) {
    enqueueUsers(PN);
    return false;
  }

  bool visitSelectInst(SelectInst &SI) {
    enqueueUsers(SI);
    return false;
  }
};
}

/// \brief Strip aggregate type wrapping.
///
/// This removes no-op aggregate types wrapping an underlying type. It will
/// strip as many layers of types as it can without changing either the type
/// size or the allocated size.
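///
/// For example, [1 x float] and { { float } } would both be stripped down to
/// float, while { float, i32 } would be left alone because dropping the outer
/// struct would change the allocated size.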
static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
  if (Ty->isSingleValueType())
    return Ty;

  uint64_t AllocSize = DL.getTypeAllocSize(Ty);
  uint64_t TypeSize = DL.getTypeSizeInBits(Ty);

  Type *InnerTy;
  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
    InnerTy = ArrTy->getElementType();
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    const StructLayout *SL = DL.getStructLayout(STy);
    unsigned Index = SL->getElementContainingOffset(0);
    InnerTy = STy->getElementType(Index);
  } else {
    return Ty;
  }

  if (AllocSize > DL.getTypeAllocSize(InnerTy) ||
      TypeSize > DL.getTypeSizeInBits(InnerTy))
    return Ty;

  return stripAggregateTypeWrapping(DL, InnerTy);
}

/// \brief Try to find a partition of the aggregate type passed in for a given
/// offset and size.
///
/// This recurses through the aggregate type and tries to compute a subtype
/// based on the offset and size. When the offset and size span a sub-section
/// of an array, it will even compute a new array type for that sub-section,
/// and the same for structs.
///
/// Note that this routine is very strict and tries to find a partition of the
/// type which produces the *exact* right offset and size. It is not forgiving
/// when the size or offset causes either end of a type-based partition to be
/// off. Also, this is a best-effort routine. It is reasonable to give up and
/// not return a type if necessary.
static Type *getTypePartition(const DataLayout &TD, Type *Ty,
                              uint64_t Offset, uint64_t Size) {
  if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size)
    return stripAggregateTypeWrapping(TD, Ty);
  if (Offset > TD.getTypeAllocSize(Ty) ||
      (TD.getTypeAllocSize(Ty) - Offset) < Size)
    return 0;

  if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
    // We can't partition pointers...
    if (SeqTy->isPointerTy())
      return 0;

    Type *ElementTy = SeqTy->getElementType();
    uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
    uint64_t NumSkippedElements = Offset / ElementSize;
    if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy))
      if (NumSkippedElements >= ArrTy->getNumElements())
        return 0;
    if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy))
      if (NumSkippedElements >= VecTy->getNumElements())
        return 0;
    Offset -= NumSkippedElements * ElementSize;

    // First check if we need to recurse.
    if (Offset > 0 || Size < ElementSize) {
      // Bail if the partition ends in a different array element.
      if ((Offset + Size) > ElementSize)
        return 0;
      // Recurse through the element type trying to peel off offset bytes.
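      // (For example, a query over [2 x { i32, i32 }] at offset 12 with size 4
      // skips one array element and recurses into { i32, i32 } at offset 4,
      // ultimately yielding i32.)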
      return getTypePartition(TD, ElementTy, Offset, Size);
    }
    assert(Offset == 0);

    if (Size == ElementSize)
      return stripAggregateTypeWrapping(TD, ElementTy);
    assert(Size > ElementSize);
    uint64_t NumElements = Size / ElementSize;
    if (NumElements * ElementSize != Size)
      return 0;
    return ArrayType::get(ElementTy, NumElements);
  }

  StructType *STy = dyn_cast<StructType>(Ty);
  if (!STy)
    return 0;

  const StructLayout *SL = TD.getStructLayout(STy);
  if (Offset >= SL->getSizeInBytes())
    return 0;
  uint64_t EndOffset = Offset + Size;
  if (EndOffset > SL->getSizeInBytes())
    return 0;

  unsigned Index = SL->getElementContainingOffset(Offset);
  Offset -= SL->getElementOffset(Index);

  Type *ElementTy = STy->getElementType(Index);
  uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
  if (Offset >= ElementSize)
    return 0; // The offset points into alignment padding.

  // See if any partition must be contained by the element.
  if (Offset > 0 || Size < ElementSize) {
    if ((Offset + Size) > ElementSize)
      return 0;
    return getTypePartition(TD, ElementTy, Offset, Size);
  }
  assert(Offset == 0);

  if (Size == ElementSize)
    return stripAggregateTypeWrapping(TD, ElementTy);

  StructType::element_iterator EI = STy->element_begin() + Index,
                               EE = STy->element_end();
  if (EndOffset < SL->getSizeInBytes()) {
    unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
    if (Index == EndIndex)
      return 0; // Within a single element and its padding.

    // Don't try to form "natural" types if the elements don't line up with the
    // expected size.
    // FIXME: We could potentially recurse down through the last element in the
    // sub-struct to find a natural end point.
    if (SL->getElementOffset(EndIndex) != EndOffset)
      return 0;

    assert(Index < EndIndex);
    EE = STy->element_begin() + EndIndex;
  }

  // Try to build up a sub-structure.
  StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE),
                                      STy->isPacked());
  const StructLayout *SubSL = TD.getStructLayout(SubTy);
  if (Size != SubSL->getSizeInBytes())
    return 0; // The sub-struct doesn't have quite the size needed.

  return SubTy;
}

/// \brief Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
/// to rewrite uses of an alloca partition to be conducive to SSA value
/// promotion. If the partition needs a new, more refined alloca, this will
/// build that new alloca, preserving as much type information as possible, and
/// rewrite the uses of the old alloca to point at the new one and have the
/// appropriate new offsets. It also evaluates how successful the rewrite was
/// at enabling promotion and, if it was successful, queues the alloca to be
/// promoted.
bool SROA::rewriteAllocaPartition(AllocaInst &AI,
                                  AllocaPartitioning &P,
                                  AllocaPartitioning::iterator PI) {
  uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset;
  bool IsLive = false;
  for (AllocaPartitioning::use_iterator UI = P.use_begin(PI),
                                        UE = P.use_end(PI);
       UI != UE && !IsLive; ++UI)
    if (UI->U)
      IsLive = true;
  if (!IsLive)
    return false; // No live uses left of this partition.
  DEBUG(dbgs() << "Speculating PHIs and selects in partition "
               << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n");

  PHIOrSelectSpeculator Speculator(*TD, P, *this);
  DEBUG(dbgs() << " speculating ");
  DEBUG(P.print(dbgs(), PI, ""));
  Speculator.visitUsers(PI);

  // Try to compute a friendly type for this partition of the alloca. This
  // won't always succeed, in which case we fall back to a legal integer type
  // or an i8 array of an appropriate size.
  Type *AllocaTy = 0;
  if (Type *PartitionTy = P.getCommonType(PI))
    if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize)
      AllocaTy = PartitionTy;
  if (!AllocaTy)
    if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(),
                                             PI->BeginOffset, AllocaSize))
      AllocaTy = PartitionTy;
  if ((!AllocaTy ||
       (AllocaTy->isArrayTy() &&
        AllocaTy->getArrayElementType()->isIntegerTy())) &&
      TD->isLegalInteger(AllocaSize * 8))
    AllocaTy = Type::getIntNTy(*C, AllocaSize * 8);
  if (!AllocaTy)
    AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize);
  assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize);

  // Check for the case where we're going to rewrite to a new alloca of the
  // exact same type as the original, and with the same access offsets. In that
  // case, re-use the existing alloca, but still run through the rewriter to
  // perform phi and select speculation.
  AllocaInst *NewAI;
  if (AllocaTy == AI.getAllocatedType()) {
    assert(PI->BeginOffset == 0 &&
           "Non-zero begin offset but same alloca type");
    assert(PI == P.begin() && "Begin offset is zero on later partition");
    NewAI = &AI;
  } else {
    unsigned Alignment = AI.getAlignment();
    if (!Alignment) {
      // The minimum alignment which users can rely on when the explicit
      // alignment is omitted or zero is that required by the ABI for this
      // type.
      Alignment = TD->getABITypeAlignment(AI.getAllocatedType());
    }
    Alignment = MinAlign(Alignment, PI->BeginOffset);
    // If we will get at least this much alignment from the type alone, leave
    // the alloca's alignment unconstrained.
    if (Alignment <= TD->getABITypeAlignment(AllocaTy))
      Alignment = 0;
    NewAI = new AllocaInst(AllocaTy, 0, Alignment,
                           AI.getName() + ".sroa." + Twine(PI - P.begin()),
                           &AI);
    ++NumNewAllocas;
  }

  DEBUG(dbgs() << "Rewriting alloca partition "
               << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: "
               << *NewAI << "\n");

  // Track the high watermark of the post-promotion worklist. We will reset it
  // to this point if the alloca is not in fact scheduled for promotion.
  unsigned PPWOldSize = PostPromotionWorklist.size();

  AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI,
                                   PI->BeginOffset, PI->EndOffset);
  DEBUG(dbgs() << " rewriting ");
  DEBUG(P.print(dbgs(), PI, ""));
  bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI));
  if (Promotable) {
    DEBUG(dbgs() << " and queuing for promotion\n");
    PromotableAllocas.push_back(NewAI);
  } else if (NewAI != &AI) {
    // If we can't promote the alloca, iterate on it to check for new
    // refinements exposed by splitting the current alloca. Don't iterate on an
    // alloca which didn't actually change and didn't get promoted.
    Worklist.insert(NewAI);
  }

  // Drop any post-promotion work items if promotion didn't happen.
  if (!Promotable)
    while (PostPromotionWorklist.size() > PPWOldSize)
      PostPromotionWorklist.pop_back();

  return true;
}

/// \brief Walks the partitioning of an alloca rewriting uses of each partition.
bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) {
  bool Changed = false;
  for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE;
       ++PI)
    Changed |= rewriteAllocaPartition(AI, P, PI);

  return Changed;
}

/// \brief Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// a partitioning of the alloca, and then hands it off to be split and
/// rewritten as needed.
bool SROA::runOnAlloca(AllocaInst &AI) {
  DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
  ++NumAllocasAnalyzed;

  // Special case dead allocas, as they're trivial.
  if (AI.use_empty()) {
    AI.eraseFromParent();
    return true;
  }

  // Skip alloca forms that this analysis can't handle.
  if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
      TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
    return false;

  bool Changed = false;

  // First, split any FCA loads and stores touching this alloca to expose
  // better splitting and promotion opportunities.
  AggLoadStoreRewriter AggRewriter(*TD);
  Changed |= AggRewriter.rewrite(AI);

  // Build the partition set using a recursive instruction-visiting builder.
  AllocaPartitioning P(*TD, AI);
  DEBUG(P.print(dbgs()));
  if (P.isEscaped())
    return Changed;

  // Delete all the dead users of this alloca before splitting and rewriting it.
  for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(),
                                              DE = P.dead_user_end();
       DI != DE; ++DI) {
    Changed = true;
    (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
    DeadInsts.insert(*DI);
  }
  for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(),
                                            DE = P.dead_op_end();
       DO != DE; ++DO) {
    Value *OldV = **DO;
    // Clobber the use with an undef value.
    **DO = UndefValue::get(OldV->getType());
    if (Instruction *OldI = dyn_cast<Instruction>(OldV))
      if (isInstructionTriviallyDead(OldI)) {
        Changed = true;
        DeadInsts.insert(OldI);
      }
  }

  // No partitions to split. Leave the dead alloca for a later pass to clean up.
  if (P.begin() == P.end())
    return Changed;

  return splitAlloca(AI, P) || Changed;
}

/// \brief Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
/// minimize the problems of invalidated instruction pointers as such pointers
/// are used heavily in the intermediate stages of the algorithm.
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
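///
/// Note that deleting one instruction can make its operands trivially dead in
/// turn (for example, deleting a dead memcpy may leave the bitcast feeding its
/// pointer operand unused); such operands are queued and deleted in the same
/// sweep.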
void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
  while (!DeadInsts.empty()) {
    Instruction *I = DeadInsts.pop_back_val();
    DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");

    I->replaceAllUsesWith(UndefValue::get(I->getType()));

    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
        // Zero out the operand and see if it becomes trivially dead.
        *OI = 0;
        if (isInstructionTriviallyDead(U))
          DeadInsts.insert(U);
      }

    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
      DeletedAllocas.insert(AI);

    ++NumDeleted;
    I->eraseFromParent();
  }
}

/// \brief Promote the allocas, using the best available technique.
///
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
/// If there is a domtree available, we attempt to promote using the full power
/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is
/// based on the SSAUpdater utilities. This function returns whether any
/// promotion occurred.
bool SROA::promoteAllocas(Function &F) {
  if (PromotableAllocas.empty())
    return false;

  NumPromoted += PromotableAllocas.size();

  if (DT && !ForceSSAUpdater) {
    DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
    PromoteMemToReg(PromotableAllocas, *DT);
    PromotableAllocas.clear();
    return true;
  }

  DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
  SSAUpdater SSA;
  DIBuilder DIB(*F.getParent());
  SmallVector<Instruction*, 64> Insts;

  for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
    AllocaInst *AI = PromotableAllocas[Idx];
    for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
         UI != UE;) {
      Instruction *I = cast<Instruction>(*UI++);
      // FIXME: Currently the SSAUpdater infrastructure doesn't reason about
      // lifetime intrinsics and so we strip them (and the bitcasts+GEPs
      // leading to them) here. Eventually it should use them to optimize the
      // scalar values produced.
      if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
        assert(onlyUsedByLifetimeMarkers(I) &&
               "Found a bitcast used outside of a lifetime marker.");
        while (!I->use_empty())
          cast<Instruction>(*I->use_begin())->eraseFromParent();
        I->eraseFromParent();
        continue;
      }
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
        assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
               II->getIntrinsicID() == Intrinsic::lifetime_end);
        II->eraseFromParent();
        continue;
      }

      Insts.push_back(I);
    }
    AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
    Insts.clear();
  }

  PromotableAllocas.clear();
  return true;
}

namespace {
/// \brief A predicate to test whether an alloca belongs to a set.
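/// Used with remove_if below to purge deleted allocas from the worklists and
/// from the list of promotable allocas.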
class IsAllocaInSet {
  typedef SmallPtrSet<AllocaInst *, 4> SetType;
  const SetType &Set;

public:
  typedef AllocaInst *argument_type;

  IsAllocaInSet(const SetType &Set) : Set(Set) {}
  bool operator()(AllocaInst *AI) const { return Set.count(AI); }
};
}

bool SROA::runOnFunction(Function &F) {
  DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
  C = &F.getContext();
  TD = getAnalysisIfAvailable<DataLayout>();
  if (!TD) {
    DEBUG(dbgs() << " Skipping SROA -- no target data!\n");
    return false;
  }
  DT = getAnalysisIfAvailable<DominatorTree>();

  BasicBlock &EntryBB = F.getEntryBlock();
  for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end());
       I != E; ++I)
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
      Worklist.insert(AI);

  bool Changed = false;
  // A set of deleted alloca instruction pointers which should be removed from
  // the list of promotable allocas.
  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;

  do {
    while (!Worklist.empty()) {
      Changed |= runOnAlloca(*Worklist.pop_back_val());
      deleteDeadInstructions(DeletedAllocas);

      // Remove the deleted allocas from various lists so that we don't try to
      // continue processing them.
      if (!DeletedAllocas.empty()) {
        Worklist.remove_if(IsAllocaInSet(DeletedAllocas));
        PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas));
        PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
                                               PromotableAllocas.end(),
                                               IsAllocaInSet(DeletedAllocas)),
                                PromotableAllocas.end());
        DeletedAllocas.clear();
      }
    }

    Changed |= promoteAllocas(F);

    Worklist = PostPromotionWorklist;
    PostPromotionWorklist.clear();
  } while (!Worklist.empty());

  return Changed;
}

void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
  if (RequiresDomTree)
    AU.addRequired<DominatorTree>();
  AU.setPreservesCFG();
}