SROA.cpp revision 225d25de490386dd6398842c19f567b921a97db6
1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9/// \file 10/// This transformation implements the well known scalar replacement of 11/// aggregates transformation. It tries to identify promotable elements of an 12/// aggregate alloca, and promote them to registers. It will also try to 13/// convert uses of an element (or set of elements) of an alloca into a vector 14/// or bitfield-style integer scalar if appropriate. 15/// 16/// It works to do this with minimal slicing of the alloca so that regions 17/// which are merely transferred in and out of external memory remain unchanged 18/// and are not decomposed to scalar code. 19/// 20/// Because this also performs alloca promotion, it can be thought of as also 21/// serving the purpose of SSA formation. The algorithm iterates on the 22/// function until all opportunities for promotion have been realized. 23/// 24//===----------------------------------------------------------------------===// 25 26#define DEBUG_TYPE "sroa" 27#include "llvm/Transforms/Scalar.h" 28#include "llvm/ADT/STLExtras.h" 29#include "llvm/ADT/SetVector.h" 30#include "llvm/ADT/SmallVector.h" 31#include "llvm/ADT/Statistic.h" 32#include "llvm/Analysis/Dominators.h" 33#include "llvm/Analysis/Loads.h" 34#include "llvm/Analysis/PtrUseVisitor.h" 35#include "llvm/Analysis/ValueTracking.h" 36#include "llvm/Constants.h" 37#include "llvm/DIBuilder.h" 38#include "llvm/DataLayout.h" 39#include "llvm/DebugInfo.h" 40#include "llvm/DerivedTypes.h" 41#include "llvm/Function.h" 42#include "llvm/IRBuilder.h" 43#include "llvm/InstVisitor.h" 44#include "llvm/Instructions.h" 45#include "llvm/IntrinsicInst.h" 46#include "llvm/LLVMContext.h" 47#include "llvm/Module.h" 48#include "llvm/Operator.h" 49#include "llvm/Pass.h" 50#include "llvm/Support/CommandLine.h" 51#include "llvm/Support/Debug.h" 52#include "llvm/Support/ErrorHandling.h" 53#include "llvm/Support/GetElementPtrTypeIterator.h" 54#include "llvm/Support/MathExtras.h" 55#include "llvm/Support/raw_ostream.h" 56#include "llvm/Transforms/Utils/Local.h" 57#include "llvm/Transforms/Utils/PromoteMemToReg.h" 58#include "llvm/Transforms/Utils/SSAUpdater.h" 59using namespace llvm; 60 61STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); 62STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); 63STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); 64STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); 65STATISTIC(NumDeleted, "Number of instructions deleted"); 66STATISTIC(NumVectorized, "Number of vectorized aggregates"); 67 68/// Hidden option to force the pass to not use DomTree and mem2reg, instead 69/// forming SSA values through the SSAUpdater infrastructure. 70static cl::opt<bool> 71ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); 72 73namespace { 74/// \brief Alloca partitioning representation. 75/// 76/// This class represents a partitioning of an alloca into slices, and 77/// information about the nature of uses of each slice of the alloca. The goal 78/// is that this information is sufficient to decide if and how to split the 79/// alloca apart and replace slices with scalars. 
It is also intended that this 80/// structure can capture the relevant information needed both to decide about 81/// and to enact these transformations. 82class AllocaPartitioning { 83public: 84 /// \brief A common base class for representing a half-open byte range. 85 struct ByteRange { 86 /// \brief The beginning offset of the range. 87 uint64_t BeginOffset; 88 89 /// \brief The ending offset, not included in the range. 90 uint64_t EndOffset; 91 92 ByteRange() : BeginOffset(), EndOffset() {} 93 ByteRange(uint64_t BeginOffset, uint64_t EndOffset) 94 : BeginOffset(BeginOffset), EndOffset(EndOffset) {} 95 96 /// \brief Support for ordering ranges. 97 /// 98 /// This provides an ordering over ranges such that start offsets are 99 /// always increasing, and within equal start offsets, the end offsets are 100 /// decreasing. Thus the spanning range comes first in a cluster with the 101 /// same start position. 102 bool operator<(const ByteRange &RHS) const { 103 if (BeginOffset < RHS.BeginOffset) return true; 104 if (BeginOffset > RHS.BeginOffset) return false; 105 if (EndOffset > RHS.EndOffset) return true; 106 return false; 107 } 108 109 /// \brief Support comparison with a single offset to allow binary searches. 110 friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) { 111 return LHS.BeginOffset < RHSOffset; 112 } 113 114 friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, 115 const ByteRange &RHS) { 116 return LHSOffset < RHS.BeginOffset; 117 } 118 119 bool operator==(const ByteRange &RHS) const { 120 return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset; 121 } 122 bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); } 123 }; 124 125 /// \brief A partition of an alloca. 126 /// 127 /// This structure represents a contiguous partition of the alloca. These are 128 /// formed by examining the uses of the alloca. During formation, they may 129 /// overlap but once an AllocaPartitioning is built, the Partitions within it 130 /// are all disjoint. 131 struct Partition : public ByteRange { 132 /// \brief Whether this partition is splittable into smaller partitions. 133 /// 134 /// We flag partitions as splittable when they are formed entirely due to 135 /// accesses by trivially splittable operations such as memset and memcpy. 136 bool IsSplittable; 137 138 /// \brief Test whether a partition has been marked as dead. 139 bool isDead() const { 140 if (BeginOffset == UINT64_MAX) { 141 assert(EndOffset == UINT64_MAX); 142 return true; 143 } 144 return false; 145 } 146 147 /// \brief Kill a partition. 148 /// This is accomplished by setting both its beginning and end offset to 149 /// the maximum possible value. 150 void kill() { 151 assert(!isDead() && "He's Dead, Jim!"); 152 BeginOffset = EndOffset = UINT64_MAX; 153 } 154 155 Partition() : ByteRange(), IsSplittable() {} 156 Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) 157 : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} 158 }; 159 160 /// \brief A particular use of a partition of the alloca. 161 /// 162 /// This structure is used to associate uses of a partition with it. They 163 /// mark the range of bytes which are referenced by a particular instruction, 164 /// and includes a handle to the user itself and the pointer value in use. 165 /// The bounds of these uses are determined by intersecting the bounds of the 166 /// memory use itself with a particular partition. 
As a consequence there is 167 /// intentionally overlap between various uses of the same partition. 168 struct PartitionUse : public ByteRange { 169 /// \brief The use in question. Provides access to both user and used value. 170 /// 171 /// Note that this may be null if the partition use is *dead*, that is, it 172 /// should be ignored. 173 Use *U; 174 175 PartitionUse() : ByteRange(), U() {} 176 PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U) 177 : ByteRange(BeginOffset, EndOffset), U(U) {} 178 }; 179 180 /// \brief Construct a partitioning of a particular alloca. 181 /// 182 /// Construction does most of the work for partitioning the alloca. This 183 /// performs the necessary walks of users and builds a partitioning from it. 184 AllocaPartitioning(const DataLayout &TD, AllocaInst &AI); 185 186 /// \brief Test whether a pointer to the allocation escapes our analysis. 187 /// 188 /// If this is true, the partitioning is never fully built and should be 189 /// ignored. 190 bool isEscaped() const { return PointerEscapingInstr; } 191 192 /// \brief Support for iterating over the partitions. 193 /// @{ 194 typedef SmallVectorImpl<Partition>::iterator iterator; 195 iterator begin() { return Partitions.begin(); } 196 iterator end() { return Partitions.end(); } 197 198 typedef SmallVectorImpl<Partition>::const_iterator const_iterator; 199 const_iterator begin() const { return Partitions.begin(); } 200 const_iterator end() const { return Partitions.end(); } 201 /// @} 202 203 /// \brief Support for iterating over and manipulating a particular 204 /// partition's uses. 205 /// 206 /// The iteration support provided for uses is more limited, but also 207 /// includes some manipulation routines to support rewriting the uses of 208 /// partitions during SROA. 209 /// @{ 210 typedef SmallVectorImpl<PartitionUse>::iterator use_iterator; 211 use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); } 212 use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); } 213 use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); } 214 use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); } 215 216 typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator; 217 const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); } 218 const_use_iterator use_begin(const_iterator I) const { 219 return Uses[I - begin()].begin(); 220 } 221 const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); } 222 const_use_iterator use_end(const_iterator I) const { 223 return Uses[I - begin()].end(); 224 } 225 226 unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); } 227 unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); } 228 const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const { 229 return Uses[PIdx][UIdx]; 230 } 231 const PartitionUse &getUse(const_iterator I, unsigned UIdx) const { 232 return Uses[I - begin()][UIdx]; 233 } 234 235 void use_push_back(unsigned Idx, const PartitionUse &PU) { 236 Uses[Idx].push_back(PU); 237 } 238 void use_push_back(const_iterator I, const PartitionUse &PU) { 239 Uses[I - begin()].push_back(PU); 240 } 241 /// @} 242 243 /// \brief Allow iterating the dead users for this alloca. 244 /// 245 /// These are instructions which will never actually use the alloca as they 246 /// are outside the allocated range. They are safe to replace with undef and 247 /// delete. 
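  /// For example (an illustrative snippet, not drawn from a test case),
  /// a load whose constant offset lands entirely past the end of the alloca
  /// is such a dead user:
  /// \code
  ///   %a = alloca [4 x i8]
  ///   %p = getelementptr [4 x i8]* %a, i32 0, i32 8   ; starts at offset 8
  ///   %v = load i8* %p                                ; dead user of %a
  /// \endcode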
248 /// @{ 249 typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator; 250 dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); } 251 dead_user_iterator dead_user_end() const { return DeadUsers.end(); } 252 /// @} 253 254 /// \brief Allow iterating the dead expressions referring to this alloca. 255 /// 256 /// These are operands which have cannot actually be used to refer to the 257 /// alloca as they are outside its range and the user doesn't correct for 258 /// that. These mostly consist of PHI node inputs and the like which we just 259 /// need to replace with undef. 260 /// @{ 261 typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator; 262 dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); } 263 dead_op_iterator dead_op_end() const { return DeadOperands.end(); } 264 /// @} 265 266 /// \brief MemTransferInst auxiliary data. 267 /// This struct provides some auxiliary data about memory transfer 268 /// intrinsics such as memcpy and memmove. These intrinsics can use two 269 /// different ranges within the same alloca, and provide other challenges to 270 /// correctly represent. We stash extra data to help us untangle this 271 /// after the partitioning is complete. 272 struct MemTransferOffsets { 273 /// The destination begin and end offsets when the destination is within 274 /// this alloca. If the end offset is zero the destination is not within 275 /// this alloca. 276 uint64_t DestBegin, DestEnd; 277 278 /// The source begin and end offsets when the source is within this alloca. 279 /// If the end offset is zero, the source is not within this alloca. 280 uint64_t SourceBegin, SourceEnd; 281 282 /// Flag for whether an alloca is splittable. 283 bool IsSplittable; 284 }; 285 MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const { 286 return MemTransferInstData.lookup(&II); 287 } 288 289 /// \brief Map from a PHI or select operand back to a partition. 290 /// 291 /// When manipulating PHI nodes or selects, they can use more than one 292 /// partition of an alloca. We store a special mapping to allow finding the 293 /// partition referenced by each of these operands, if any. 294 iterator findPartitionForPHIOrSelectOperand(Use *U) { 295 SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt 296 = PHIOrSelectOpMap.find(U); 297 if (MapIt == PHIOrSelectOpMap.end()) 298 return end(); 299 300 return begin() + MapIt->second.first; 301 } 302 303 /// \brief Map from a PHI or select operand back to the specific use of 304 /// a partition. 305 /// 306 /// Similar to mapping these operands back to the partitions, this maps 307 /// directly to the use structure of that partition. 308 use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) { 309 SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt 310 = PHIOrSelectOpMap.find(U); 311 assert(MapIt != PHIOrSelectOpMap.end()); 312 return Uses[MapIt->second.first].begin() + MapIt->second.second; 313 } 314 315 /// \brief Compute a common type among the uses of a particular partition. 316 /// 317 /// This routines walks all of the uses of a particular partition and tries 318 /// to find a common type between them. Untyped operations such as memset and 319 /// memcpy are ignored. 
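  /// For example, a partition [0,8) whose exactly-covering uses are
  /// \code
  ///   %x = load i64* %ptr
  ///   store i64 %y, i64* %ptr
  /// \endcode
  /// has common type i64; even if a covering double store is also present,
  /// the covering integer type is preferred as a "bucket of bits".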
320 Type *getCommonType(iterator I) const; 321 322#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 323 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; 324 void printUsers(raw_ostream &OS, const_iterator I, 325 StringRef Indent = " ") const; 326 void print(raw_ostream &OS) const; 327 void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const; 328 void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const; 329#endif 330 331private: 332 template <typename DerivedT, typename RetT = void> class BuilderBase; 333 class PartitionBuilder; 334 friend class AllocaPartitioning::PartitionBuilder; 335 class UseBuilder; 336 friend class AllocaPartitioning::UseBuilder; 337 338#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 339 /// \brief Handle to alloca instruction to simplify method interfaces. 340 AllocaInst &AI; 341#endif 342 343 /// \brief The instruction responsible for this alloca having no partitioning. 344 /// 345 /// When an instruction (potentially) escapes the pointer to the alloca, we 346 /// store a pointer to that here and abort trying to partition the alloca. 347 /// This will be null if the alloca is partitioned successfully. 348 Instruction *PointerEscapingInstr; 349 350 /// \brief The partitions of the alloca. 351 /// 352 /// We store a vector of the partitions over the alloca here. This vector is 353 /// sorted by increasing begin offset, and then by decreasing end offset. See 354 /// the Partition inner class for more details. Initially (during 355 /// construction) there are overlaps, but we form a disjoint sequence of 356 /// partitions while finishing construction and a fully constructed object is 357 /// expected to always have this as a disjoint space. 358 SmallVector<Partition, 8> Partitions; 359 360 /// \brief The uses of the partitions. 361 /// 362 /// This is essentially a mapping from each partition to a list of uses of 363 /// that partition. The mapping is done with a Uses vector that has the exact 364 /// same number of entries as the partition vector. Each entry is itself 365 /// a vector of the uses. 366 SmallVector<SmallVector<PartitionUse, 2>, 8> Uses; 367 368 /// \brief Instructions which will become dead if we rewrite the alloca. 369 /// 370 /// Note that these are not separated by partition. This is because we expect 371 /// a partitioned alloca to be completely rewritten or not rewritten at all. 372 /// If rewritten, all these instructions can simply be removed and replaced 373 /// with undef as they come from outside of the allocated space. 374 SmallVector<Instruction *, 8> DeadUsers; 375 376 /// \brief Operands which will become dead if we rewrite the alloca. 377 /// 378 /// These are operands that in their particular use can be replaced with 379 /// undef when we rewrite the alloca. These show up in out-of-bounds inputs 380 /// to PHI nodes and the like. They aren't entirely dead (there might be 381 /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we 382 /// want to swap this particular input for undef to simplify the use lists of 383 /// the alloca. 384 SmallVector<Use *, 8> DeadOperands; 385 386 /// \brief The underlying storage for auxiliary memcpy and memset info. 387 SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData; 388 389 /// \brief A side datastructure used when building up the partitions and uses. 
390 /// 391 /// This mapping is only really used during the initial building of the 392 /// partitioning so that we can retain information about PHI and select nodes 393 /// processed. 394 SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes; 395 396 /// \brief Auxiliary information for particular PHI or select operands. 397 SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap; 398 399 /// \brief A utility routine called from the constructor. 400 /// 401 /// This does what it says on the tin. It is the key of the alloca partition 402 /// splitting and merging. After it is called we have the desired disjoint 403 /// collection of partitions. 404 void splitAndMergePartitions(); 405}; 406} 407 408static Value *foldSelectInst(SelectInst &SI) { 409 // If the condition being selected on is a constant or the same value is 410 // being selected between, fold the select. Yes this does (rarely) happen 411 // early on. 412 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) 413 return SI.getOperand(1+CI->isZero()); 414 if (SI.getOperand(1) == SI.getOperand(2)) { 415 return SI.getOperand(1); 416 } 417 return 0; 418} 419 420/// \brief Builder for the alloca partitioning. 421/// 422/// This class builds an alloca partitioning by recursively visiting the uses 423/// of an alloca and splitting the partitions for each load and store at each 424/// offset. 425class AllocaPartitioning::PartitionBuilder 426 : public PtrUseVisitor<PartitionBuilder> { 427 friend class PtrUseVisitor<PartitionBuilder>; 428 friend class InstVisitor<PartitionBuilder>; 429 typedef PtrUseVisitor<PartitionBuilder> Base; 430 431 const uint64_t AllocSize; 432 AllocaPartitioning &P; 433 434 SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap; 435 436public: 437 PartitionBuilder(const DataLayout &DL, AllocaInst &AI, AllocaPartitioning &P) 438 : PtrUseVisitor<PartitionBuilder>(DL), 439 AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), 440 P(P) {} 441 442private: 443 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size, 444 bool IsSplittable = false) { 445 // Completely skip uses which have a zero size or start either before or 446 // past the end of the allocation. 447 if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) { 448 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset 449 << " which has zero size or starts outside of the " 450 << AllocSize << " byte alloca:\n" 451 << " alloca: " << P.AI << "\n" 452 << " use: " << I << "\n"); 453 return; 454 } 455 456 uint64_t BeginOffset = Offset.getZExtValue(); 457 uint64_t EndOffset = BeginOffset + Size; 458 459 // Clamp the end offset to the end of the allocation. Note that this is 460 // formulated to handle even the case where "BeginOffset + Size" overflows. 461 // NOTE! This may appear superficially to be something we could ignore 462 // entirely, but that is not so! There may be PHI-node uses where some 463 // instructions are dead but not others. We can't completely ignore the 464 // PHI node, and so have to record at least the information here. 465 assert(AllocSize >= BeginOffset); // Established above. 
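    // (Since BeginOffset <= AllocSize, "AllocSize - BeginOffset" cannot
    // underflow, so comparing Size against the difference is the
    // overflow-safe form of "BeginOffset + Size > AllocSize".)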
466 if (Size > AllocSize - BeginOffset) { 467 DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset 468 << " to remain within the " << AllocSize << " byte alloca:\n" 469 << " alloca: " << P.AI << "\n" 470 << " use: " << I << "\n"); 471 EndOffset = AllocSize; 472 } 473 474 Partition New(BeginOffset, EndOffset, IsSplittable); 475 P.Partitions.push_back(New); 476 } 477 478 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, 479 bool IsVolatile) { 480 uint64_t Size = DL.getTypeStoreSize(Ty); 481 482 // If this memory access can be shown to *statically* extend outside the 483 // bounds of of the allocation, it's behavior is undefined, so simply 484 // ignore it. Note that this is more strict than the generic clamping 485 // behavior of insertUse. We also try to handle cases which might run the 486 // risk of overflow. 487 // FIXME: We should instead consider the pointer to have escaped if this 488 // function is being instrumented for addressing bugs or race conditions. 489 if (Offset.isNegative() || Size > AllocSize || 490 Offset.ugt(AllocSize - Size)) { 491 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte " 492 << (isa<LoadInst>(I) ? "load" : "store") << " @" << Offset 493 << " which extends past the end of the " << AllocSize 494 << " byte alloca:\n" 495 << " alloca: " << P.AI << "\n" 496 << " use: " << I << "\n"); 497 return; 498 } 499 500 // We allow splitting of loads and stores where the type is an integer type 501 // and which cover the entire alloca. Such integer loads and stores 502 // often require decomposition into fine grained loads and stores. 503 bool IsSplittable = false; 504 if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) 505 IsSplittable = !IsVolatile && ITy->getBitWidth() == AllocSize*8; 506 507 insertUse(I, Offset, Size, IsSplittable); 508 } 509 510 void visitLoadInst(LoadInst &LI) { 511 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) && 512 "All simple FCA loads should have been pre-split"); 513 514 if (!IsOffsetKnown) 515 return PI.setAborted(&LI); 516 517 return handleLoadOrStore(LI.getType(), LI, Offset, LI.isVolatile()); 518 } 519 520 void visitStoreInst(StoreInst &SI) { 521 Value *ValOp = SI.getValueOperand(); 522 if (ValOp == *U) 523 return PI.setEscapedAndAborted(&SI); 524 if (!IsOffsetKnown) 525 return PI.setAborted(&SI); 526 527 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && 528 "All simple FCA stores should have been pre-split"); 529 handleLoadOrStore(ValOp->getType(), SI, Offset, SI.isVolatile()); 530 } 531 532 533 void visitMemSetInst(MemSetInst &II) { 534 assert(II.getRawDest() == *U && "Pointer use is not the destination?"); 535 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 536 if ((Length && Length->getValue() == 0) || 537 (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) 538 // Zero-length mem transfer intrinsics can be ignored entirely. 539 return; 540 541 if (!IsOffsetKnown) 542 return PI.setAborted(&II); 543 544 insertUse(II, Offset, 545 Length ? Length->getLimitedValue() 546 : AllocSize - Offset.getLimitedValue(), 547 (bool)Length); 548 } 549 550 void visitMemTransferInst(MemTransferInst &II) { 551 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 552 if ((Length && Length->getValue() == 0) || 553 (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) 554 // Zero-length mem transfer intrinsics can be ignored entirely. 
555 return; 556 557 if (!IsOffsetKnown) 558 return PI.setAborted(&II); 559 560 uint64_t RawOffset = Offset.getLimitedValue(); 561 uint64_t Size = Length ? Length->getLimitedValue() 562 : AllocSize - RawOffset; 563 564 MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; 565 566 // Only intrinsics with a constant length can be split. 567 Offsets.IsSplittable = Length; 568 569 if (*U == II.getRawDest()) { 570 Offsets.DestBegin = RawOffset; 571 Offsets.DestEnd = RawOffset + Size; 572 } 573 if (*U == II.getRawSource()) { 574 Offsets.SourceBegin = RawOffset; 575 Offsets.SourceEnd = RawOffset + Size; 576 } 577 578 // If we have set up end offsets for both the source and the destination, 579 // we have found both sides of this transfer pointing at the same alloca. 580 bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd; 581 if (SeenBothEnds && II.getRawDest() != II.getRawSource()) { 582 unsigned PrevIdx = MemTransferPartitionMap[&II]; 583 584 // Check if the begin offsets match and this is a non-volatile transfer. 585 // In that case, we can completely elide the transfer. 586 if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) { 587 P.Partitions[PrevIdx].kill(); 588 return; 589 } 590 591 // Otherwise we have an offset transfer within the same alloca. We can't 592 // split those. 593 P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false; 594 } else if (SeenBothEnds) { 595 // Handle the case where this exact use provides both ends of the 596 // operation. 597 assert(II.getRawDest() == II.getRawSource()); 598 599 // For non-volatile transfers this is a no-op. 600 if (!II.isVolatile()) 601 return; 602 603 // Otherwise just suppress splitting. 604 Offsets.IsSplittable = false; 605 } 606 607 608 // Insert the use now that we've fixed up the splittable nature. 609 insertUse(II, Offset, Size, Offsets.IsSplittable); 610 611 // Setup the mapping from intrinsic to partition of we've not seen both 612 // ends of this transfer. 613 if (!SeenBothEnds) { 614 unsigned NewIdx = P.Partitions.size() - 1; 615 bool Inserted 616 = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second; 617 assert(Inserted && 618 "Already have intrinsic in map but haven't seen both ends"); 619 (void)Inserted; 620 } 621 } 622 623 // Disable SRoA for any intrinsics except for lifetime invariants. 624 // FIXME: What about debug instrinsics? This matches old behavior, but 625 // doesn't make sense. 626 void visitIntrinsicInst(IntrinsicInst &II) { 627 if (!IsOffsetKnown) 628 return PI.setAborted(&II); 629 630 if (II.getIntrinsicID() == Intrinsic::lifetime_start || 631 II.getIntrinsicID() == Intrinsic::lifetime_end) { 632 ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); 633 uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(), 634 Length->getLimitedValue()); 635 insertUse(II, Offset, Size, true); 636 return; 637 } 638 639 Base::visitIntrinsicInst(II); 640 } 641 642 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { 643 // We consider any PHI or select that results in a direct load or store of 644 // the same offset to be a viable use for partitioning purposes. These uses 645 // are considered unsplittable and the size is the maximum loaded or stored 646 // size. 647 SmallPtrSet<Instruction *, 4> Visited; 648 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses; 649 Visited.insert(Root); 650 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root)); 651 // If there are no loads or stores, the access is dead. 
We mark that as 652 // a size zero access. 653 Size = 0; 654 do { 655 Instruction *I, *UsedI; 656 llvm::tie(UsedI, I) = Uses.pop_back_val(); 657 658 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 659 Size = std::max(Size, DL.getTypeStoreSize(LI->getType())); 660 continue; 661 } 662 if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 663 Value *Op = SI->getOperand(0); 664 if (Op == UsedI) 665 return SI; 666 Size = std::max(Size, DL.getTypeStoreSize(Op->getType())); 667 continue; 668 } 669 670 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { 671 if (!GEP->hasAllZeroIndices()) 672 return GEP; 673 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) && 674 !isa<SelectInst>(I)) { 675 return I; 676 } 677 678 for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; 679 ++UI) 680 if (Visited.insert(cast<Instruction>(*UI))) 681 Uses.push_back(std::make_pair(I, cast<Instruction>(*UI))); 682 } while (!Uses.empty()); 683 684 return 0; 685 } 686 687 void visitPHINode(PHINode &PN) { 688 if (PN.use_empty()) 689 return; 690 if (!IsOffsetKnown) 691 return PI.setAborted(&PN); 692 693 // See if we already have computed info on this node. 694 std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN]; 695 if (PHIInfo.first) { 696 PHIInfo.second = true; 697 insertUse(PN, Offset, PHIInfo.first); 698 return; 699 } 700 701 // Check for an unsafe use of the PHI node. 702 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first)) 703 return PI.setAborted(UnsafeI); 704 705 insertUse(PN, Offset, PHIInfo.first); 706 } 707 708 void visitSelectInst(SelectInst &SI) { 709 if (SI.use_empty()) 710 return; 711 if (Value *Result = foldSelectInst(SI)) { 712 if (Result == *U) 713 // If the result of the constant fold will be the pointer, recurse 714 // through the select as if we had RAUW'ed it. 715 enqueueUsers(SI); 716 717 return; 718 } 719 if (!IsOffsetKnown) 720 return PI.setAborted(&SI); 721 722 // See if we already have computed info on this node. 723 std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI]; 724 if (SelectInfo.first) { 725 SelectInfo.second = true; 726 insertUse(SI, Offset, SelectInfo.first); 727 return; 728 } 729 730 // Check for an unsafe use of the PHI node. 731 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first)) 732 return PI.setAborted(UnsafeI); 733 734 insertUse(SI, Offset, SelectInfo.first); 735 } 736 737 /// \brief Disable SROA entirely if there are unhandled users of the alloca. 738 void visitInstruction(Instruction &I) { 739 PI.setAborted(&I); 740 } 741}; 742 743/// \brief Use adder for the alloca partitioning. 744/// 745/// This class adds the uses of an alloca to all of the partitions which they 746/// use. For splittable partitions, this can end up doing essentially a linear 747/// walk of the partitions, but the number of steps remains bounded by the 748/// total result instruction size: 749/// - The number of partitions is a result of the number unsplittable 750/// instructions using the alloca. 751/// - The number of users of each partition is at worst the total number of 752/// splittable instructions using the alloca. 753/// Thus we will produce N * M instructions in the end, where N are the number 754/// of unsplittable uses and M are the number of splittable. This visitor does 755/// the exact same number of updates to the partitioning. 
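/// For example, a memset with a constant length covering the whole alloca is
/// splittable, and this visitor records one PartitionUse for it in every
/// partition it overlaps; that single access alone accounts for N of the
/// N * M updates.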
///
/// In the more common case, this visitor will leverage the fact that the
/// partition space is pre-sorted, and do a logarithmic search for the
/// partition needed, making the total visit a classical ((N + M) * log(N))
/// complexity operation.
class AllocaPartitioning::UseBuilder : public PtrUseVisitor<UseBuilder> {
  friend class PtrUseVisitor<UseBuilder>;
  friend class InstVisitor<UseBuilder>;
  typedef PtrUseVisitor<UseBuilder> Base;

  const uint64_t AllocSize;
  AllocaPartitioning &P;

  /// \brief Set to de-duplicate dead instructions found in the use walk.
  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;

public:
  UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
      : PtrUseVisitor<UseBuilder>(TD),
        AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())),
        P(P) {}

private:
  void markAsDead(Instruction &I) {
    if (VisitedDeadInsts.insert(&I))
      P.DeadUsers.push_back(&I);
  }

  void insertUse(Instruction &User, const APInt &Offset, uint64_t Size) {
    // If the use has a zero size or extends outside of the allocation, record
    // it as a dead use for elimination later.
    if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize))
      return markAsDead(User);

    uint64_t BeginOffset = Offset.getZExtValue();
    uint64_t EndOffset = BeginOffset + Size;

    // Clamp the end offset to the end of the allocation. Note that this is
    // formulated to handle even the case where "BeginOffset + Size" overflows.
    assert(AllocSize >= BeginOffset); // Established above.
    if (Size > AllocSize - BeginOffset)
      EndOffset = AllocSize;

    // NB: This only works if we have zero overlapping partitions.
    iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset);
    if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset)
      B = llvm::prior(B);
    for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset;
         ++I) {
      PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset),
                         std::min(I->EndOffset, EndOffset), U);
      P.use_push_back(I, NewPU);
      if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser()))
        P.PHIOrSelectOpMap[U]
          = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1);
    }
  }

  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset) {
    uint64_t Size = DL.getTypeStoreSize(Ty);

    // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
    // ignore it. Note that this is more strict than the generic clamping
    // behavior of insertUse.
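    // (The "Size > AllocSize" test also guards the unsigned subtraction in
    // the next clause so that "AllocSize - Size" cannot wrap.)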
821 if (Offset.isNegative() || Size > AllocSize || 822 Offset.ugt(AllocSize - Size)) 823 return markAsDead(I); 824 825 insertUse(I, Offset, Size); 826 } 827 828 void visitBitCastInst(BitCastInst &BC) { 829 if (BC.use_empty()) 830 return markAsDead(BC); 831 832 return Base::visitBitCastInst(BC); 833 } 834 835 void visitGetElementPtrInst(GetElementPtrInst &GEPI) { 836 if (GEPI.use_empty()) 837 return markAsDead(GEPI); 838 839 return Base::visitGetElementPtrInst(GEPI); 840 } 841 842 void visitLoadInst(LoadInst &LI) { 843 assert(IsOffsetKnown); 844 handleLoadOrStore(LI.getType(), LI, Offset); 845 } 846 847 void visitStoreInst(StoreInst &SI) { 848 assert(IsOffsetKnown); 849 handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset); 850 } 851 852 void visitMemSetInst(MemSetInst &II) { 853 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 854 if ((Length && Length->getValue() == 0) || 855 (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) 856 return markAsDead(II); 857 858 assert(IsOffsetKnown); 859 insertUse(II, Offset, Length ? Length->getLimitedValue() 860 : AllocSize - Offset.getLimitedValue()); 861 } 862 863 void visitMemTransferInst(MemTransferInst &II) { 864 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 865 if ((Length && Length->getValue() == 0) || 866 (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) 867 return markAsDead(II); 868 869 assert(IsOffsetKnown); 870 uint64_t Size = Length ? Length->getLimitedValue() 871 : AllocSize - Offset.getLimitedValue(); 872 873 MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; 874 if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd && 875 Offsets.DestBegin == Offsets.SourceBegin) 876 return markAsDead(II); // Skip identity transfers without side-effects. 877 878 insertUse(II, Offset, Size); 879 } 880 881 void visitIntrinsicInst(IntrinsicInst &II) { 882 assert(IsOffsetKnown); 883 assert(II.getIntrinsicID() == Intrinsic::lifetime_start || 884 II.getIntrinsicID() == Intrinsic::lifetime_end); 885 886 ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); 887 insertUse(II, Offset, std::min(Length->getLimitedValue(), 888 AllocSize - Offset.getLimitedValue())); 889 } 890 891 void insertPHIOrSelect(Instruction &User, const APInt &Offset) { 892 uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first; 893 894 // For PHI and select operands outside the alloca, we can't nuke the entire 895 // phi or select -- the other side might still be relevant, so we special 896 // case them here and use a separate structure to track the operands 897 // themselves which should be replaced with undef. 898 if ((Offset.isNegative() && Offset.uge(Size)) || 899 (!Offset.isNegative() && Offset.uge(AllocSize))) { 900 P.DeadOperands.push_back(U); 901 return; 902 } 903 904 insertUse(User, Offset, Size); 905 } 906 907 void visitPHINode(PHINode &PN) { 908 if (PN.use_empty()) 909 return markAsDead(PN); 910 911 assert(IsOffsetKnown); 912 insertPHIOrSelect(PN, Offset); 913 } 914 915 void visitSelectInst(SelectInst &SI) { 916 if (SI.use_empty()) 917 return markAsDead(SI); 918 919 if (Value *Result = foldSelectInst(SI)) { 920 if (Result == *U) 921 // If the result of the constant fold will be the pointer, recurse 922 // through the select as if we had RAUW'ed it. 923 enqueueUsers(SI); 924 else 925 // Otherwise the operand to the select is dead, and we can replace it 926 // with undef. 
927 P.DeadOperands.push_back(U); 928 929 return; 930 } 931 932 assert(IsOffsetKnown); 933 insertPHIOrSelect(SI, Offset); 934 } 935 936 /// \brief Unreachable, we've already visited the alloca once. 937 void visitInstruction(Instruction &I) { 938 llvm_unreachable("Unhandled instruction in use builder."); 939 } 940}; 941 942void AllocaPartitioning::splitAndMergePartitions() { 943 size_t NumDeadPartitions = 0; 944 945 // Track the range of splittable partitions that we pass when accumulating 946 // overlapping unsplittable partitions. 947 uint64_t SplitEndOffset = 0ull; 948 949 Partition New(0ull, 0ull, false); 950 951 for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) { 952 ++j; 953 954 if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) { 955 assert(New.BeginOffset == New.EndOffset); 956 New = Partitions[i]; 957 } else { 958 assert(New.IsSplittable); 959 New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset); 960 } 961 assert(New.BeginOffset != New.EndOffset); 962 963 // Scan the overlapping partitions. 964 while (j != e && New.EndOffset > Partitions[j].BeginOffset) { 965 // If the new partition we are forming is splittable, stop at the first 966 // unsplittable partition. 967 if (New.IsSplittable && !Partitions[j].IsSplittable) 968 break; 969 970 // Grow the new partition to include any equally splittable range. 'j' is 971 // always equally splittable when New is splittable, but when New is not 972 // splittable, we may subsume some (or part of some) splitable partition 973 // without growing the new one. 974 if (New.IsSplittable == Partitions[j].IsSplittable) { 975 New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset); 976 } else { 977 assert(!New.IsSplittable); 978 assert(Partitions[j].IsSplittable); 979 SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset); 980 } 981 982 Partitions[j].kill(); 983 ++NumDeadPartitions; 984 ++j; 985 } 986 987 // If the new partition is splittable, chop off the end as soon as the 988 // unsplittable subsequent partition starts and ensure we eventually cover 989 // the splittable area. 990 if (j != e && New.IsSplittable) { 991 SplitEndOffset = std::max(SplitEndOffset, New.EndOffset); 992 New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); 993 } 994 995 // Add the new partition if it differs from the original one and is 996 // non-empty. We can end up with an empty partition here if it was 997 // splittable but there is an unsplittable one that starts at the same 998 // offset. 999 if (New != Partitions[i]) { 1000 if (New.BeginOffset != New.EndOffset) 1001 Partitions.push_back(New); 1002 // Mark the old one for removal. 1003 Partitions[i].kill(); 1004 ++NumDeadPartitions; 1005 } 1006 1007 New.BeginOffset = New.EndOffset; 1008 if (!New.IsSplittable) { 1009 New.EndOffset = std::max(New.EndOffset, SplitEndOffset); 1010 if (j != e && !Partitions[j].IsSplittable) 1011 New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); 1012 New.IsSplittable = true; 1013 // If there is a trailing splittable partition which won't be fused into 1014 // the next splittable partition go ahead and add it onto the partitions 1015 // list. 1016 if (New.BeginOffset < New.EndOffset && 1017 (j == e || !Partitions[j].IsSplittable || 1018 New.EndOffset < Partitions[j].BeginOffset)) { 1019 Partitions.push_back(New); 1020 New.BeginOffset = New.EndOffset = 0ull; 1021 } 1022 } 1023 } 1024 1025 // Re-sort the partitions now that they have been split and merged into 1026 // disjoint set of partitions. 
Also remove any of the dead partitions we've 1027 // replaced in the process. 1028 std::sort(Partitions.begin(), Partitions.end()); 1029 if (NumDeadPartitions) { 1030 assert(Partitions.back().isDead()); 1031 assert((ptrdiff_t)NumDeadPartitions == 1032 std::count(Partitions.begin(), Partitions.end(), Partitions.back())); 1033 } 1034 Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end()); 1035} 1036 1037AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI) 1038 : 1039#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1040 AI(AI), 1041#endif 1042 PointerEscapingInstr(0) { 1043 PartitionBuilder PB(TD, AI, *this); 1044 PartitionBuilder::PtrInfo PtrI = PB.visitPtr(AI); 1045 if (PtrI.isEscaped() || PtrI.isAborted()) { 1046 // FIXME: We should sink the escape vs. abort info into the caller nicely, 1047 // possibly by just storing the PtrInfo in the AllocaPartitioning. 1048 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst() 1049 : PtrI.getAbortingInst(); 1050 assert(PointerEscapingInstr && "Did not track a bad instruction"); 1051 return; 1052 } 1053 1054 // Sort the uses. This arranges for the offsets to be in ascending order, 1055 // and the sizes to be in descending order. 1056 std::sort(Partitions.begin(), Partitions.end()); 1057 1058 // Remove any partitions from the back which are marked as dead. 1059 while (!Partitions.empty() && Partitions.back().isDead()) 1060 Partitions.pop_back(); 1061 1062 if (Partitions.size() > 1) { 1063 // Intersect splittability for all partitions with equal offsets and sizes. 1064 // Then remove all but the first so that we have a sequence of non-equal but 1065 // potentially overlapping partitions. 1066 for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E; 1067 I = J) { 1068 ++J; 1069 while (J != E && *I == *J) { 1070 I->IsSplittable &= J->IsSplittable; 1071 ++J; 1072 } 1073 } 1074 Partitions.erase(std::unique(Partitions.begin(), Partitions.end()), 1075 Partitions.end()); 1076 1077 // Split splittable and merge unsplittable partitions into a disjoint set 1078 // of partitions over the used space of the allocation. 1079 splitAndMergePartitions(); 1080 } 1081 1082 // Now build up the user lists for each of these disjoint partitions by 1083 // re-walking the recursive users of the alloca. 1084 Uses.resize(Partitions.size()); 1085 UseBuilder UB(TD, AI, *this); 1086 PtrI = UB.visitPtr(AI); 1087 assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!"); 1088 assert(!PtrI.isAborted() && "Early aborted the visit of the pointer."); 1089} 1090 1091Type *AllocaPartitioning::getCommonType(iterator I) const { 1092 Type *Ty = 0; 1093 for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { 1094 if (!UI->U) 1095 continue; // Skip dead uses. 1096 if (isa<IntrinsicInst>(*UI->U->getUser())) 1097 continue; 1098 if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset) 1099 continue; 1100 1101 Type *UserTy = 0; 1102 if (LoadInst *LI = dyn_cast<LoadInst>(UI->U->getUser())) { 1103 UserTy = LI->getType(); 1104 } else if (StoreInst *SI = dyn_cast<StoreInst>(UI->U->getUser())) { 1105 UserTy = SI->getValueOperand()->getType(); 1106 } else { 1107 return 0; // Bail if we have weird uses. 1108 } 1109 1110 if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { 1111 // If the type is larger than the partition, skip it. We only encounter 1112 // this for split integer operations where we want to use the type of the 1113 // entity causing the split. 
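      // (For example, an i64 that was recorded against a 4-byte slice of a
      // split integer access is skipped here; the type of the operation that
      // actually caused the split should win instead.)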
1114 if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8) 1115 continue; 1116 1117 // If we have found an integer type use covering the alloca, use that 1118 // regardless of the other types, as integers are often used for a "bucket 1119 // of bits" type. 1120 return ITy; 1121 } 1122 1123 if (Ty && Ty != UserTy) 1124 return 0; 1125 1126 Ty = UserTy; 1127 } 1128 return Ty; 1129} 1130 1131#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1132 1133void AllocaPartitioning::print(raw_ostream &OS, const_iterator I, 1134 StringRef Indent) const { 1135 OS << Indent << "partition #" << (I - begin()) 1136 << " [" << I->BeginOffset << "," << I->EndOffset << ")" 1137 << (I->IsSplittable ? " (splittable)" : "") 1138 << (Uses[I - begin()].empty() ? " (zero uses)" : "") 1139 << "\n"; 1140} 1141 1142void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, 1143 StringRef Indent) const { 1144 for (const_use_iterator UI = use_begin(I), UE = use_end(I); 1145 UI != UE; ++UI) { 1146 if (!UI->U) 1147 continue; // Skip dead uses. 1148 OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " 1149 << "used by: " << *UI->U->getUser() << "\n"; 1150 if (MemTransferInst *II = dyn_cast<MemTransferInst>(UI->U->getUser())) { 1151 const MemTransferOffsets &MTO = MemTransferInstData.lookup(II); 1152 bool IsDest; 1153 if (!MTO.IsSplittable) 1154 IsDest = UI->BeginOffset == MTO.DestBegin; 1155 else 1156 IsDest = MTO.DestBegin != 0u; 1157 OS << Indent << " (original " << (IsDest ? "dest" : "source") << ": " 1158 << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin) 1159 << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n"; 1160 } 1161 } 1162} 1163 1164void AllocaPartitioning::print(raw_ostream &OS) const { 1165 if (PointerEscapingInstr) { 1166 OS << "No partitioning for alloca: " << AI << "\n" 1167 << " A pointer to this alloca escaped by:\n" 1168 << " " << *PointerEscapingInstr << "\n"; 1169 return; 1170 } 1171 1172 OS << "Partitioning of alloca: " << AI << "\n"; 1173 unsigned Num = 0; 1174 for (const_iterator I = begin(), E = end(); I != E; ++I, ++Num) { 1175 print(OS, I); 1176 printUsers(OS, I); 1177 } 1178} 1179 1180void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); } 1181void AllocaPartitioning::dump() const { print(dbgs()); } 1182 1183#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1184 1185 1186namespace { 1187/// \brief Implementation of LoadAndStorePromoter for promoting allocas. 1188/// 1189/// This subclass of LoadAndStorePromoter adds overrides to handle promoting 1190/// the loads and stores of an alloca instruction, as well as updating its 1191/// debug information. This is used when a domtree is unavailable and thus 1192/// mem2reg in its full form can't be used to handle promotion of allocas to 1193/// scalar values. 1194class AllocaPromoter : public LoadAndStorePromoter { 1195 AllocaInst &AI; 1196 DIBuilder &DIB; 1197 1198 SmallVector<DbgDeclareInst *, 4> DDIs; 1199 SmallVector<DbgValueInst *, 4> DVIs; 1200 1201public: 1202 AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, 1203 AllocaInst &AI, DIBuilder &DIB) 1204 : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} 1205 1206 void run(const SmallVectorImpl<Instruction*> &Insts) { 1207 // Remember which alloca we're promoting (for isInstInList). 
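    // Debug intrinsics don't appear as direct users of the alloca; they refer
    // to it through a metadata node wrapping the pointer. Look that node up,
    // if it exists, and walk its users to collect the dbg.declare and
    // dbg.value intrinsics that must be rewritten or erased along with the
    // alloca.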
1208 if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { 1209 for (Value::use_iterator UI = DebugNode->use_begin(), 1210 UE = DebugNode->use_end(); 1211 UI != UE; ++UI) 1212 if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) 1213 DDIs.push_back(DDI); 1214 else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) 1215 DVIs.push_back(DVI); 1216 } 1217 1218 LoadAndStorePromoter::run(Insts); 1219 AI.eraseFromParent(); 1220 while (!DDIs.empty()) 1221 DDIs.pop_back_val()->eraseFromParent(); 1222 while (!DVIs.empty()) 1223 DVIs.pop_back_val()->eraseFromParent(); 1224 } 1225 1226 virtual bool isInstInList(Instruction *I, 1227 const SmallVectorImpl<Instruction*> &Insts) const { 1228 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 1229 return LI->getOperand(0) == &AI; 1230 return cast<StoreInst>(I)->getPointerOperand() == &AI; 1231 } 1232 1233 virtual void updateDebugInfo(Instruction *Inst) const { 1234 for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), 1235 E = DDIs.end(); I != E; ++I) { 1236 DbgDeclareInst *DDI = *I; 1237 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) 1238 ConvertDebugDeclareToDebugValue(DDI, SI, DIB); 1239 else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) 1240 ConvertDebugDeclareToDebugValue(DDI, LI, DIB); 1241 } 1242 for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), 1243 E = DVIs.end(); I != E; ++I) { 1244 DbgValueInst *DVI = *I; 1245 Value *Arg = NULL; 1246 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { 1247 // If an argument is zero extended then use argument directly. The ZExt 1248 // may be zapped by an optimization pass in future. 1249 if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) 1250 Arg = dyn_cast<Argument>(ZExt->getOperand(0)); 1251 if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) 1252 Arg = dyn_cast<Argument>(SExt->getOperand(0)); 1253 if (!Arg) 1254 Arg = SI->getOperand(0); 1255 } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { 1256 Arg = LI->getOperand(0); 1257 } else { 1258 continue; 1259 } 1260 Instruction *DbgVal = 1261 DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), 1262 Inst); 1263 DbgVal->setDebugLoc(DVI->getDebugLoc()); 1264 } 1265 } 1266}; 1267} // end anon namespace 1268 1269 1270namespace { 1271/// \brief An optimization pass providing Scalar Replacement of Aggregates. 1272/// 1273/// This pass takes allocations which can be completely analyzed (that is, they 1274/// don't escape) and tries to turn them into scalar SSA values. There are 1275/// a few steps to this process. 1276/// 1277/// 1) It takes allocations of aggregates and analyzes the ways in which they 1278/// are used to try to split them into smaller allocations, ideally of 1279/// a single scalar data type. It will split up memcpy and memset accesses 1280/// as necessary and try to isolate invidual scalar accesses. 1281/// 2) It will transform accesses into forms which are suitable for SSA value 1282/// promotion. This can be replacing a memset with a scalar store of an 1283/// integer value, or it can involve speculating operations on a PHI or 1284/// select to be a PHI or select of the results. 1285/// 3) Finally, this will try to detect a pattern of accesses which map cleanly 1286/// onto insert and extract operations on a vector value, and convert them to 1287/// this form. By doing so, it will enable promotion of vector aggregates to 1288/// SSA vector values. 
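///
/// As a small illustrative example (not drawn from a test case), an alloca
/// such as
/// \code
///   %a = alloca { i32, i32 }
///   %f0 = getelementptr { i32, i32 }* %a, i32 0, i32 0
///   store i32 1, i32* %f0
///   %v = load i32* %f0
/// \endcode
/// can have its first element split into its own alloca and then promoted, so
/// that %v simply becomes the value i32 1 and no memory operations remain for
/// that element.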
1289class SROA : public FunctionPass { 1290 const bool RequiresDomTree; 1291 1292 LLVMContext *C; 1293 const DataLayout *TD; 1294 DominatorTree *DT; 1295 1296 /// \brief Worklist of alloca instructions to simplify. 1297 /// 1298 /// Each alloca in the function is added to this. Each new alloca formed gets 1299 /// added to it as well to recursively simplify unless that alloca can be 1300 /// directly promoted. Finally, each time we rewrite a use of an alloca other 1301 /// the one being actively rewritten, we add it back onto the list if not 1302 /// already present to ensure it is re-visited. 1303 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist; 1304 1305 /// \brief A collection of instructions to delete. 1306 /// We try to batch deletions to simplify code and make things a bit more 1307 /// efficient. 1308 SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts; 1309 1310 /// \brief Post-promotion worklist. 1311 /// 1312 /// Sometimes we discover an alloca which has a high probability of becoming 1313 /// viable for SROA after a round of promotion takes place. In those cases, 1314 /// the alloca is enqueued here for re-processing. 1315 /// 1316 /// Note that we have to be very careful to clear allocas out of this list in 1317 /// the event they are deleted. 1318 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist; 1319 1320 /// \brief A collection of alloca instructions we can directly promote. 1321 std::vector<AllocaInst *> PromotableAllocas; 1322 1323public: 1324 SROA(bool RequiresDomTree = true) 1325 : FunctionPass(ID), RequiresDomTree(RequiresDomTree), 1326 C(0), TD(0), DT(0) { 1327 initializeSROAPass(*PassRegistry::getPassRegistry()); 1328 } 1329 bool runOnFunction(Function &F); 1330 void getAnalysisUsage(AnalysisUsage &AU) const; 1331 1332 const char *getPassName() const { return "SROA"; } 1333 static char ID; 1334 1335private: 1336 friend class PHIOrSelectSpeculator; 1337 friend class AllocaPartitionRewriter; 1338 friend class AllocaPartitionVectorRewriter; 1339 1340 bool rewriteAllocaPartition(AllocaInst &AI, 1341 AllocaPartitioning &P, 1342 AllocaPartitioning::iterator PI); 1343 bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P); 1344 bool runOnAlloca(AllocaInst &AI); 1345 void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); 1346 bool promoteAllocas(Function &F); 1347}; 1348} 1349 1350char SROA::ID = 0; 1351 1352FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { 1353 return new SROA(RequiresDomTree); 1354} 1355 1356INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", 1357 false, false) 1358INITIALIZE_PASS_DEPENDENCY(DominatorTree) 1359INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", 1360 false, false) 1361 1362namespace { 1363/// \brief Visitor to speculate PHIs and Selects where possible. 1364class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> { 1365 // Befriend the base class so it can delegate to private visit methods. 1366 friend class llvm::InstVisitor<PHIOrSelectSpeculator>; 1367 1368 const DataLayout &TD; 1369 AllocaPartitioning &P; 1370 SROA &Pass; 1371 1372public: 1373 PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass) 1374 : TD(TD), P(P), Pass(Pass) {} 1375 1376 /// \brief Visit the users of an alloca partition and rewrite them. 
1377 void visitUsers(AllocaPartitioning::const_iterator PI) { 1378 // Note that we need to use an index here as the underlying vector of uses 1379 // may be grown during speculation. However, we never need to re-visit the 1380 // new uses, and so we can use the initial size bound. 1381 for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) { 1382 const AllocaPartitioning::PartitionUse &PU = P.getUse(PI, Idx); 1383 if (!PU.U) 1384 continue; // Skip dead use. 1385 1386 visit(cast<Instruction>(PU.U->getUser())); 1387 } 1388 } 1389 1390private: 1391 // By default, skip this instruction. 1392 void visitInstruction(Instruction &I) {} 1393 1394 /// PHI instructions that use an alloca and are subsequently loaded can be 1395 /// rewritten to load both input pointers in the pred blocks and then PHI the 1396 /// results, allowing the load of the alloca to be promoted. 1397 /// From this: 1398 /// %P2 = phi [i32* %Alloca, i32* %Other] 1399 /// %V = load i32* %P2 1400 /// to: 1401 /// %V1 = load i32* %Alloca -> will be mem2reg'd 1402 /// ... 1403 /// %V2 = load i32* %Other 1404 /// ... 1405 /// %V = phi [i32 %V1, i32 %V2] 1406 /// 1407 /// We can do this to a select if its only uses are loads and if the operands 1408 /// to the select can be loaded unconditionally. 1409 /// 1410 /// FIXME: This should be hoisted into a generic utility, likely in 1411 /// Transforms/Util/Local.h 1412 bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) { 1413 // For now, we can only do this promotion if the load is in the same block 1414 // as the PHI, and if there are no stores between the phi and load. 1415 // TODO: Allow recursive phi users. 1416 // TODO: Allow stores. 1417 BasicBlock *BB = PN.getParent(); 1418 unsigned MaxAlign = 0; 1419 for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); 1420 UI != UE; ++UI) { 1421 LoadInst *LI = dyn_cast<LoadInst>(*UI); 1422 if (LI == 0 || !LI->isSimple()) return false; 1423 1424 // For now we only allow loads in the same block as the PHI. This is 1425 // a common case that happens when instcombine merges two loads through 1426 // a PHI. 1427 if (LI->getParent() != BB) return false; 1428 1429 // Ensure that there are no instructions between the PHI and the load that 1430 // could store. 1431 for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) 1432 if (BBI->mayWriteToMemory()) 1433 return false; 1434 1435 MaxAlign = std::max(MaxAlign, LI->getAlignment()); 1436 Loads.push_back(LI); 1437 } 1438 1439 // We can only transform this if it is safe to push the loads into the 1440 // predecessor blocks. The only thing to watch out for is that we can't put 1441 // a possibly trapping load in the predecessor if it is a critical edge. 1442 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; 1443 ++Idx) { 1444 TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); 1445 Value *InVal = PN.getIncomingValue(Idx); 1446 1447 // If the value is produced by the terminator of the predecessor (an 1448 // invoke) or it has side-effects, there is no valid place to put a load 1449 // in the predecessor. 1450 if (TI == InVal || TI->mayHaveSideEffects()) 1451 return false; 1452 1453 // If the predecessor has a single successor, then the edge isn't 1454 // critical. 1455 if (TI->getNumSuccessors() == 1) 1456 continue; 1457 1458 // If this pointer is always safe to load, or if we can prove that there 1459 // is already a load in the block, then we can move the load to the pred 1460 // block. 
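      // (isDereferenceablePointer covers values that are always safe to load,
      // such as allocas; isSafeToLoadUnconditionally additionally accepts
      // pointers for which an existing load or store of the same pointer is
      // already visible before the predecessor's terminator.)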
1461 if (InVal->isDereferenceablePointer() || 1462 isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD)) 1463 continue; 1464 1465 return false; 1466 } 1467 1468 return true; 1469 } 1470 1471 void visitPHINode(PHINode &PN) { 1472 DEBUG(dbgs() << " original: " << PN << "\n"); 1473 1474 SmallVector<LoadInst *, 4> Loads; 1475 if (!isSafePHIToSpeculate(PN, Loads)) 1476 return; 1477 1478 assert(!Loads.empty()); 1479 1480 Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); 1481 IRBuilder<> PHIBuilder(&PN); 1482 PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), 1483 PN.getName() + ".sroa.speculated"); 1484 1485 // Get the TBAA tag and alignment to use from one of the loads. It doesn't 1486 // matter which one we get and if any differ, it doesn't matter. 1487 LoadInst *SomeLoad = cast<LoadInst>(Loads.back()); 1488 MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); 1489 unsigned Align = SomeLoad->getAlignment(); 1490 1491 // Rewrite all loads of the PN to use the new PHI. 1492 do { 1493 LoadInst *LI = Loads.pop_back_val(); 1494 LI->replaceAllUsesWith(NewPN); 1495 Pass.DeadInsts.insert(LI); 1496 } while (!Loads.empty()); 1497 1498 // Inject loads into all of the pred blocks. 1499 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 1500 BasicBlock *Pred = PN.getIncomingBlock(Idx); 1501 TerminatorInst *TI = Pred->getTerminator(); 1502 Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx)); 1503 Value *InVal = PN.getIncomingValue(Idx); 1504 IRBuilder<> PredBuilder(TI); 1505 1506 LoadInst *Load 1507 = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." + 1508 Pred->getName())); 1509 ++NumLoadsSpeculated; 1510 Load->setAlignment(Align); 1511 if (TBAATag) 1512 Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); 1513 NewPN->addIncoming(Load, Pred); 1514 1515 Instruction *Ptr = dyn_cast<Instruction>(InVal); 1516 if (!Ptr) 1517 // No uses to rewrite. 1518 continue; 1519 1520 // Try to lookup and rewrite any partition uses corresponding to this phi 1521 // input. 1522 AllocaPartitioning::iterator PI 1523 = P.findPartitionForPHIOrSelectOperand(InUse); 1524 if (PI == P.end()) 1525 continue; 1526 1527 // Replace the Use in the PartitionUse for this operand with the Use 1528 // inside the load. 1529 AllocaPartitioning::use_iterator UI 1530 = P.findPartitionUseForPHIOrSelectOperand(InUse); 1531 assert(isa<PHINode>(*UI->U->getUser())); 1532 UI->U = &Load->getOperandUse(Load->getPointerOperandIndex()); 1533 } 1534 DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); 1535 } 1536 1537 /// Select instructions that use an alloca and are subsequently loaded can be 1538 /// rewritten to load both input pointers and then select between the result, 1539 /// allowing the load of the alloca to be promoted. 1540 /// From this: 1541 /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other 1542 /// %V = load i32* %P2 1543 /// to: 1544 /// %V1 = load i32* %Alloca -> will be mem2reg'd 1545 /// %V2 = load i32* %Other 1546 /// %V = select i1 %cond, i32 %V1, i32 %V2 1547 /// 1548 /// We can do this to a select if its only uses are loads and if the operand 1549 /// to the select can be loaded unconditionally. 
1550 bool isSafeSelectToSpeculate(SelectInst &SI, 1551 SmallVectorImpl<LoadInst *> &Loads) { 1552 Value *TValue = SI.getTrueValue(); 1553 Value *FValue = SI.getFalseValue(); 1554 bool TDerefable = TValue->isDereferenceablePointer(); 1555 bool FDerefable = FValue->isDereferenceablePointer(); 1556 1557 for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); 1558 UI != UE; ++UI) { 1559 LoadInst *LI = dyn_cast<LoadInst>(*UI); 1560 if (LI == 0 || !LI->isSimple()) return false; 1561 1562 // Both operands to the select need to be dereferencable, either 1563 // absolutely (e.g. allocas) or at this point because we can see other 1564 // accesses to it. 1565 if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI, 1566 LI->getAlignment(), &TD)) 1567 return false; 1568 if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI, 1569 LI->getAlignment(), &TD)) 1570 return false; 1571 Loads.push_back(LI); 1572 } 1573 1574 return true; 1575 } 1576 1577 void visitSelectInst(SelectInst &SI) { 1578 DEBUG(dbgs() << " original: " << SI << "\n"); 1579 IRBuilder<> IRB(&SI); 1580 1581 // If the select isn't safe to speculate, just use simple logic to emit it. 1582 SmallVector<LoadInst *, 4> Loads; 1583 if (!isSafeSelectToSpeculate(SI, Loads)) 1584 return; 1585 1586 Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) }; 1587 AllocaPartitioning::iterator PIs[2]; 1588 AllocaPartitioning::PartitionUse PUs[2]; 1589 for (unsigned i = 0, e = 2; i != e; ++i) { 1590 PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]); 1591 if (PIs[i] != P.end()) { 1592 // If the pointer is within the partitioning, remove the select from 1593 // its uses. We'll add in the new loads below. 1594 AllocaPartitioning::use_iterator UI 1595 = P.findPartitionUseForPHIOrSelectOperand(Ops[i]); 1596 PUs[i] = *UI; 1597 // Clear out the use here so that the offsets into the use list remain 1598 // stable but this use is ignored when rewriting. 1599 UI->U = 0; 1600 } 1601 } 1602 1603 Value *TV = SI.getTrueValue(); 1604 Value *FV = SI.getFalseValue(); 1605 // Replace the loads of the select with a select of two loads. 1606 while (!Loads.empty()) { 1607 LoadInst *LI = Loads.pop_back_val(); 1608 1609 IRB.SetInsertPoint(LI); 1610 LoadInst *TL = 1611 IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); 1612 LoadInst *FL = 1613 IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); 1614 NumLoadsSpeculated += 2; 1615 1616 // Transfer alignment and TBAA info if present. 1617 TL->setAlignment(LI->getAlignment()); 1618 FL->setAlignment(LI->getAlignment()); 1619 if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { 1620 TL->setMetadata(LLVMContext::MD_tbaa, Tag); 1621 FL->setMetadata(LLVMContext::MD_tbaa, Tag); 1622 } 1623 1624 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, 1625 LI->getName() + ".sroa.speculated"); 1626 1627 LoadInst *Loads[2] = { TL, FL }; 1628 for (unsigned i = 0, e = 2; i != e; ++i) { 1629 if (PIs[i] != P.end()) { 1630 Use *LoadUse = &Loads[i]->getOperandUse(0); 1631 assert(PUs[i].U->get() == LoadUse->get()); 1632 PUs[i].U = LoadUse; 1633 P.use_push_back(PIs[i], PUs[i]); 1634 } 1635 } 1636 1637 DEBUG(dbgs() << " speculated to: " << *V << "\n"); 1638 LI->replaceAllUsesWith(V); 1639 Pass.DeadInsts.insert(LI); 1640 } 1641 } 1642}; 1643} 1644 1645/// \brief Accumulate the constant offsets in a GEP into a single APInt offset. 1646/// 1647/// If the provided GEP is all-constant, the total byte offset formed by the 1648/// GEP is computed and Offset is set to it. 
If the GEP has any non-constant 1649/// operands, the function returns false and the value of Offset is unmodified. 1650static bool accumulateGEPOffsets(const DataLayout &TD, GEPOperator &GEP, 1651 APInt &Offset) { 1652 APInt GEPOffset(Offset.getBitWidth(), 0); 1653 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); 1654 GTI != GTE; ++GTI) { 1655 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); 1656 if (!OpC) 1657 return false; 1658 if (OpC->isZero()) continue; 1659 1660 // Handle a struct index, which adds its field offset to the pointer. 1661 if (StructType *STy = dyn_cast<StructType>(*GTI)) { 1662 unsigned ElementIdx = OpC->getZExtValue(); 1663 const StructLayout *SL = TD.getStructLayout(STy); 1664 GEPOffset += APInt(Offset.getBitWidth(), 1665 SL->getElementOffset(ElementIdx)); 1666 continue; 1667 } 1668 1669 APInt TypeSize(Offset.getBitWidth(), 1670 TD.getTypeAllocSize(GTI.getIndexedType())); 1671 if (VectorType *VTy = dyn_cast<VectorType>(*GTI)) { 1672 assert((VTy->getScalarSizeInBits() % 8) == 0 && 1673 "vector element size is not a multiple of 8, cannot GEP over it"); 1674 TypeSize = VTy->getScalarSizeInBits() / 8; 1675 } 1676 1677 GEPOffset += OpC->getValue().sextOrTrunc(Offset.getBitWidth()) * TypeSize; 1678 } 1679 Offset = GEPOffset; 1680 return true; 1681} 1682 1683/// \brief Build a GEP out of a base pointer and indices. 1684/// 1685/// This will return the BasePtr if that is valid, or build a new GEP 1686/// instruction using the IRBuilder if GEP-ing is needed. 1687static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr, 1688 SmallVectorImpl<Value *> &Indices, 1689 const Twine &Prefix) { 1690 if (Indices.empty()) 1691 return BasePtr; 1692 1693 // A single zero index is a no-op, so check for this and avoid building a GEP 1694 // in that case. 1695 if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) 1696 return BasePtr; 1697 1698 return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx"); 1699} 1700 1701/// \brief Get a natural GEP off of the BasePtr walking through Ty toward 1702/// TargetTy without changing the offset of the pointer. 1703/// 1704/// This routine assumes we've already established a properly offset GEP with 1705/// Indices, and arrived at the Ty type. The goal is to continue to GEP with 1706/// zero-indices down through type layers until we find one the same as 1707/// TargetTy. If we can't find one with the same type, we at least try to use 1708/// one with the same size. If none of that works, we just produce the GEP as 1709/// indicated by Indices to have the correct offset. 1710static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD, 1711 Value *BasePtr, Type *Ty, Type *TargetTy, 1712 SmallVectorImpl<Value *> &Indices, 1713 const Twine &Prefix) { 1714 if (Ty == TargetTy) 1715 return buildGEP(IRB, BasePtr, Indices, Prefix); 1716 1717 // See if we can descend into a struct and locate a field with the correct 1718 // type. 1719 unsigned NumLayers = 0; 1720 Type *ElementTy = Ty; 1721 do { 1722 if (ElementTy->isPointerTy()) 1723 break; 1724 if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) { 1725 ElementTy = SeqTy->getElementType(); 1726 // Note that we use the default address space as this index is over an 1727 // array or a vector, not a pointer. 
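// Appending a zero index never changes the byte offset, it only steps the
// pointee type down one layer, e.g. [4 x { i32, float }] -> { i32, float }
// -> i32 as successive zero indices are added.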
1728 Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0))); 1729 } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { 1730 if (STy->element_begin() == STy->element_end()) 1731 break; // Nothing left to descend into. 1732 ElementTy = *STy->element_begin(); 1733 Indices.push_back(IRB.getInt32(0)); 1734 } else { 1735 break; 1736 } 1737 ++NumLayers; 1738 } while (ElementTy != TargetTy); 1739 if (ElementTy != TargetTy) 1740 Indices.erase(Indices.end() - NumLayers, Indices.end()); 1741 1742 return buildGEP(IRB, BasePtr, Indices, Prefix); 1743} 1744 1745/// \brief Recursively compute indices for a natural GEP. 1746/// 1747/// This is the recursive step for getNaturalGEPWithOffset that walks down the 1748/// element types adding appropriate indices for the GEP. 1749static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, 1750 Value *Ptr, Type *Ty, APInt &Offset, 1751 Type *TargetTy, 1752 SmallVectorImpl<Value *> &Indices, 1753 const Twine &Prefix) { 1754 if (Offset == 0) 1755 return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix); 1756 1757 // We can't recurse through pointer types. 1758 if (Ty->isPointerTy()) 1759 return 0; 1760 1761 // We try to analyze GEPs over vectors here, but note that these GEPs are 1762 // extremely poorly defined currently. The long-term goal is to remove GEPing 1763 // over a vector from the IR completely. 1764 if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { 1765 unsigned ElementSizeInBits = VecTy->getScalarSizeInBits(); 1766 if (ElementSizeInBits % 8) 1767 return 0; // GEPs over non-multiple of 8 size vector elements are invalid. 1768 APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); 1769 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1770 if (NumSkippedElements.ugt(VecTy->getNumElements())) 1771 return 0; 1772 Offset -= NumSkippedElements * ElementSize; 1773 Indices.push_back(IRB.getInt(NumSkippedElements)); 1774 return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), 1775 Offset, TargetTy, Indices, Prefix); 1776 } 1777 1778 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 1779 Type *ElementTy = ArrTy->getElementType(); 1780 APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); 1781 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1782 if (NumSkippedElements.ugt(ArrTy->getNumElements())) 1783 return 0; 1784 1785 Offset -= NumSkippedElements * ElementSize; 1786 Indices.push_back(IRB.getInt(NumSkippedElements)); 1787 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1788 Indices, Prefix); 1789 } 1790 1791 StructType *STy = dyn_cast<StructType>(Ty); 1792 if (!STy) 1793 return 0; 1794 1795 const StructLayout *SL = TD.getStructLayout(STy); 1796 uint64_t StructOffset = Offset.getZExtValue(); 1797 if (StructOffset >= SL->getSizeInBytes()) 1798 return 0; 1799 unsigned Index = SL->getElementContainingOffset(StructOffset); 1800 Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); 1801 Type *ElementTy = STy->getElementType(Index); 1802 if (Offset.uge(TD.getTypeAllocSize(ElementTy))) 1803 return 0; // The offset points into alignment padding. 1804 1805 Indices.push_back(IRB.getInt32(Index)); 1806 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1807 Indices, Prefix); 1808} 1809 1810/// \brief Get a natural GEP from a base pointer to a particular offset and 1811/// resulting in a particular type. 
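/// For example (with illustrative types), starting from an [8 x i32]* base, a
/// byte Offset of 20, and an i32 TargetTy, the GEP built here is equivalent to
/// "getelementptr inbounds [8 x i32]* %base, i64 0, i64 5".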
1812/// 1813/// The goal is to produce a "natural" looking GEP that works with the existing 1814/// composite types to arrive at the appropriate offset and element type for 1815/// a pointer. TargetTy is the element type the returned GEP should point-to if 1816/// possible. We recurse by decreasing Offset, adding the appropriate index to 1817/// Indices, and setting Ty to the result subtype. 1818/// 1819/// If no natural GEP can be constructed, this function returns null. 1820static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, 1821 Value *Ptr, APInt Offset, Type *TargetTy, 1822 SmallVectorImpl<Value *> &Indices, 1823 const Twine &Prefix) { 1824 PointerType *Ty = cast<PointerType>(Ptr->getType()); 1825 1826 // Don't consider any GEPs through an i8* as natural unless the TargetTy is 1827 // an i8. 1828 if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8)) 1829 return 0; 1830 1831 Type *ElementTy = Ty->getElementType(); 1832 if (!ElementTy->isSized()) 1833 return 0; // We can't GEP through an unsized element. 1834 APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); 1835 if (ElementSize == 0) 1836 return 0; // Zero-length arrays can't help us build a natural GEP. 1837 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1838 1839 Offset -= NumSkippedElements * ElementSize; 1840 Indices.push_back(IRB.getInt(NumSkippedElements)); 1841 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1842 Indices, Prefix); 1843} 1844 1845/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the 1846/// resulting pointer has PointerTy. 1847/// 1848/// This tries very hard to compute a "natural" GEP which arrives at the offset 1849/// and produces the pointer type desired. Where it cannot, it will try to use 1850/// the natural GEP to arrive at the offset and bitcast to the type. Where that 1851/// fails, it will try to use an existing i8* and GEP to the byte offset and 1852/// bitcast to the type. 1853/// 1854/// The strategy for finding the more natural GEPs is to peel off layers of the 1855/// pointer, walking back through bit casts and GEPs, searching for a base 1856/// pointer from which we can compute a natural GEP with the desired 1857/// properties. The algorithm tries to fold as many constant indices into 1858/// a single GEP as possible, thus making each GEP more independent of the 1859/// surrounding code. 1860static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD, 1861 Value *Ptr, APInt Offset, Type *PointerTy, 1862 const Twine &Prefix) { 1863 // Even though we don't look through PHI nodes, we could be called on an 1864 // instruction in an unreachable block, which may be on a cycle. 1865 SmallPtrSet<Value *, 4> Visited; 1866 Visited.insert(Ptr); 1867 SmallVector<Value *, 4> Indices; 1868 1869 // We may end up computing an offset pointer that has the wrong type. If we 1870 // never are able to compute one directly that has the correct type, we'll 1871 // fall back to it, so keep it around here. 1872 Value *OffsetPtr = 0; 1873 1874 // Remember any i8 pointer we come across to re-use if we need to do a raw 1875 // byte offset. 1876 Value *Int8Ptr = 0; 1877 APInt Int8PtrOffset(Offset.getBitWidth(), 0); 1878 1879 Type *TargetTy = PointerTy->getPointerElementType(); 1880 1881 do { 1882 // First fold any existing GEPs into the offset.
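// For instance, if Ptr is an all-constant GEP such as
// "getelementptr inbounds %T* %base, i64 0, i32 2" (an illustrative type %T),
// its constant byte offset is folded into Offset and the walk continues from
// %base instead.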
1883 while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { 1884 APInt GEPOffset(Offset.getBitWidth(), 0); 1885 if (!accumulateGEPOffsets(TD, *GEP, GEPOffset)) 1886 break; 1887 Offset += GEPOffset; 1888 Ptr = GEP->getPointerOperand(); 1889 if (!Visited.insert(Ptr)) 1890 break; 1891 } 1892 1893 // See if we can perform a natural GEP here. 1894 Indices.clear(); 1895 if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy, 1896 Indices, Prefix)) { 1897 if (P->getType() == PointerTy) { 1898 // Zap any offset pointer that we ended up computing in previous rounds. 1899 if (OffsetPtr && OffsetPtr->use_empty()) 1900 if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) 1901 I->eraseFromParent(); 1902 return P; 1903 } 1904 if (!OffsetPtr) { 1905 OffsetPtr = P; 1906 } 1907 } 1908 1909 // Stash this pointer if we've found an i8*. 1910 if (Ptr->getType()->isIntegerTy(8)) { 1911 Int8Ptr = Ptr; 1912 Int8PtrOffset = Offset; 1913 } 1914 1915 // Peel off a layer of the pointer and update the offset appropriately. 1916 if (Operator::getOpcode(Ptr) == Instruction::BitCast) { 1917 Ptr = cast<Operator>(Ptr)->getOperand(0); 1918 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) { 1919 if (GA->mayBeOverridden()) 1920 break; 1921 Ptr = GA->getAliasee(); 1922 } else { 1923 break; 1924 } 1925 assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!"); 1926 } while (Visited.insert(Ptr)); 1927 1928 if (!OffsetPtr) { 1929 if (!Int8Ptr) { 1930 Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(), 1931 Prefix + ".raw_cast"); 1932 Int8PtrOffset = Offset; 1933 } 1934 1935 OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr : 1936 IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), 1937 Prefix + ".raw_idx"); 1938 } 1939 Ptr = OffsetPtr; 1940 1941 // On the off chance we were targeting i8*, guard the bitcast here. 1942 if (Ptr->getType() != PointerTy) 1943 Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast"); 1944 1945 return Ptr; 1946} 1947 1948/// \brief Test whether we can convert a value from the old to the new type. 1949/// 1950/// This predicate should be used to guard calls to convertValue in order to 1951/// ensure that we only try to convert viable values. The strategy is that we 1952/// will peel off single element struct and array wrappings to get to an 1953/// underlying value, and convert that value. 1954static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { 1955 if (OldTy == NewTy) 1956 return true; 1957 if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) 1958 return false; 1959 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) 1960 return false; 1961 1962 if (NewTy->isPointerTy() || OldTy->isPointerTy()) { 1963 if (NewTy->isPointerTy() && OldTy->isPointerTy()) 1964 return true; 1965 if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) 1966 return true; 1967 return false; 1968 } 1969 1970 return true; 1971} 1972 1973/// \brief Generic routine to convert an SSA value to a value of a different 1974/// type. 1975/// 1976/// This will try various different casting techniques, such as bitcasts, 1977/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test 1978/// two types for viability with this routine. 
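///
/// For example, assuming 64-bit pointers: converting an i8* value to i64 emits
/// a ptrtoint, converting i64 back to i8* emits an inttoptr, and converting
/// i64 to double falls through to a plain bitcast.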
1979static Value *convertValue(const DataLayout &DL, IRBuilder<> &IRB, Value *V, 1980 Type *Ty) { 1981 assert(canConvertValue(DL, V->getType(), Ty) && 1982 "Value not convertable to type"); 1983 if (V->getType() == Ty) 1984 return V; 1985 if (V->getType()->isIntegerTy() && Ty->isPointerTy()) 1986 return IRB.CreateIntToPtr(V, Ty); 1987 if (V->getType()->isPointerTy() && Ty->isIntegerTy()) 1988 return IRB.CreatePtrToInt(V, Ty); 1989 1990 return IRB.CreateBitCast(V, Ty); 1991} 1992 1993/// \brief Test whether the given alloca partition can be promoted to a vector. 1994/// 1995/// This is a quick test to check whether we can rewrite a particular alloca 1996/// partition (and its newly formed alloca) into a vector alloca with only 1997/// whole-vector loads and stores such that it could be promoted to a vector 1998/// SSA value. We only can ensure this for a limited set of operations, and we 1999/// don't want to do the rewrites unless we are confident that the result will 2000/// be promotable, so we have an early test here. 2001static bool isVectorPromotionViable(const DataLayout &TD, 2002 Type *AllocaTy, 2003 AllocaPartitioning &P, 2004 uint64_t PartitionBeginOffset, 2005 uint64_t PartitionEndOffset, 2006 AllocaPartitioning::const_use_iterator I, 2007 AllocaPartitioning::const_use_iterator E) { 2008 VectorType *Ty = dyn_cast<VectorType>(AllocaTy); 2009 if (!Ty) 2010 return false; 2011 2012 uint64_t VecSize = TD.getTypeSizeInBits(Ty); 2013 uint64_t ElementSize = Ty->getScalarSizeInBits(); 2014 2015 // While the definition of LLVM vectors is bitpacked, we don't support sizes 2016 // that aren't byte sized. 2017 if (ElementSize % 8) 2018 return false; 2019 assert((VecSize % 8) == 0 && "vector size not a multiple of element size?"); 2020 VecSize /= 8; 2021 ElementSize /= 8; 2022 2023 for (; I != E; ++I) { 2024 if (!I->U) 2025 continue; // Skip dead use. 2026 2027 uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset; 2028 uint64_t BeginIndex = BeginOffset / ElementSize; 2029 if (BeginIndex * ElementSize != BeginOffset || 2030 BeginIndex >= Ty->getNumElements()) 2031 return false; 2032 uint64_t EndOffset = I->EndOffset - PartitionBeginOffset; 2033 uint64_t EndIndex = EndOffset / ElementSize; 2034 if (EndIndex * ElementSize != EndOffset || 2035 EndIndex > Ty->getNumElements()) 2036 return false; 2037 2038 assert(EndIndex > BeginIndex && "Empty vector!"); 2039 uint64_t NumElements = EndIndex - BeginIndex; 2040 Type *PartitionTy 2041 = (NumElements == 1) ? Ty->getElementType() 2042 : VectorType::get(Ty->getElementType(), NumElements); 2043 2044 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) { 2045 if (MI->isVolatile()) 2046 return false; 2047 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) { 2048 const AllocaPartitioning::MemTransferOffsets &MTO 2049 = P.getMemTransferOffsets(*MTI); 2050 if (!MTO.IsSplittable) 2051 return false; 2052 } 2053 } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) { 2054 // Disable vector promotion when there are loads or stores of an FCA. 
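// (An FCA here is a first-class aggregate such as "{ i32, float }" that is
// loaded or stored as a whole; those accesses are not rewritten into vector
// element operations.)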
2055 return false; 2056 } else if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) { 2057 if (LI->isVolatile()) 2058 return false; 2059 if (!canConvertValue(TD, PartitionTy, LI->getType())) 2060 return false; 2061 } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) { 2062 if (SI->isVolatile()) 2063 return false; 2064 if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy)) 2065 return false; 2066 } else { 2067 return false; 2068 } 2069 } 2070 return true; 2071} 2072 2073/// \brief Test whether the given alloca partition's integer operations can be 2074/// widened to promotable ones. 2075/// 2076/// This is a quick test to check whether we can rewrite the integer loads and 2077/// stores to a particular alloca into wider loads and stores and be able to 2078/// promote the resulting alloca. 2079static bool isIntegerWideningViable(const DataLayout &TD, 2080 Type *AllocaTy, 2081 uint64_t AllocBeginOffset, 2082 AllocaPartitioning &P, 2083 AllocaPartitioning::const_use_iterator I, 2084 AllocaPartitioning::const_use_iterator E) { 2085 uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy); 2086 // Don't create integer types larger than the maximum bitwidth. 2087 if (SizeInBits > IntegerType::MAX_INT_BITS) 2088 return false; 2089 2090 // Don't try to handle allocas with bit-padding. 2091 if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy)) 2092 return false; 2093 2094 // We need to ensure that an integer type with the appropriate bitwidth can 2095 // be converted to the alloca type, whatever that is. We don't want to force 2096 // the alloca itself to have an integer type if there is a more suitable one. 2097 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits); 2098 if (!canConvertValue(TD, AllocaTy, IntTy) || 2099 !canConvertValue(TD, IntTy, AllocaTy)) 2100 return false; 2101 2102 uint64_t Size = TD.getTypeStoreSize(AllocaTy); 2103 2104 // Check the uses to ensure the uses are (likely) promotable integer uses. 2105 // Also ensure that the alloca has a covering load or store. We don't want 2106 // to widen the integer operations only to fail to promote due to some other 2107 // unsplittable entry (which we may make splittable later). 2108 bool WholeAllocaOp = false; 2109 for (; I != E; ++I) { 2110 if (!I->U) 2111 continue; // Skip dead use. 2112 2113 uint64_t RelBegin = I->BeginOffset - AllocBeginOffset; 2114 uint64_t RelEnd = I->EndOffset - AllocBeginOffset; 2115 2116 // We can't reasonably handle cases where the load or store extends past 2117 // the end of the alloca's type and into its padding. 2118 if (RelEnd > Size) 2119 return false; 2120 2121 if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) { 2122 if (LI->isVolatile()) 2123 return false; 2124 if (RelBegin == 0 && RelEnd == Size) 2125 WholeAllocaOp = true; 2126 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { 2127 if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy)) 2128 return false; 2129 continue; 2130 } 2131 // Non-integer loads need to be convertible from the alloca type so that 2132 // they are promotable.
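// For example, a float load spanning an entire 4-byte float alloca passes
// this check, while a non-integer load touching only part of the alloca
// forces us to give up on widening.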
2133 if (RelBegin != 0 || RelEnd != Size || 2134 !canConvertValue(TD, AllocaTy, LI->getType())) 2135 return false; 2136 } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) { 2137 Type *ValueTy = SI->getValueOperand()->getType(); 2138 if (SI->isVolatile()) 2139 return false; 2140 if (RelBegin == 0 && RelEnd == Size) 2141 WholeAllocaOp = true; 2142 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { 2143 if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy)) 2144 return false; 2145 continue; 2146 } 2147 // Non-integer stores need to be convertible to the alloca type so that 2148 // they are promotable. 2149 if (RelBegin != 0 || RelEnd != Size || 2150 !canConvertValue(TD, ValueTy, AllocaTy)) 2151 return false; 2152 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) { 2153 if (MI->isVolatile()) 2154 return false; 2155 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) { 2156 const AllocaPartitioning::MemTransferOffsets &MTO 2157 = P.getMemTransferOffsets(*MTI); 2158 if (!MTO.IsSplittable) 2159 return false; 2160 } 2161 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->U->getUser())) { 2162 if (II->getIntrinsicID() != Intrinsic::lifetime_start && 2163 II->getIntrinsicID() != Intrinsic::lifetime_end) 2164 return false; 2165 } else { 2166 return false; 2167 } 2168 } 2169 return WholeAllocaOp; 2170} 2171 2172static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V, 2173 IntegerType *Ty, uint64_t Offset, 2174 const Twine &Name) { 2175 DEBUG(dbgs() << " start: " << *V << "\n"); 2176 IntegerType *IntTy = cast<IntegerType>(V->getType()); 2177 assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && 2178 "Element extends past full value"); 2179 uint64_t ShAmt = 8*Offset; 2180 if (DL.isBigEndian()) 2181 ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); 2182 if (ShAmt) { 2183 V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); 2184 DEBUG(dbgs() << " shifted: " << *V << "\n"); 2185 } 2186 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 2187 "Cannot extract to a larger integer!"); 2188 if (Ty != IntTy) { 2189 V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); 2190 DEBUG(dbgs() << " trunced: " << *V << "\n"); 2191 } 2192 return V; 2193} 2194 2195static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old, 2196 Value *V, uint64_t Offset, const Twine &Name) { 2197 IntegerType *IntTy = cast<IntegerType>(Old->getType()); 2198 IntegerType *Ty = cast<IntegerType>(V->getType()); 2199 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 2200 "Cannot insert a larger integer!"); 2201 DEBUG(dbgs() << " start: " << *V << "\n"); 2202 if (Ty != IntTy) { 2203 V = IRB.CreateZExt(V, IntTy, Name + ".ext"); 2204 DEBUG(dbgs() << " extended: " << *V << "\n"); 2205 } 2206 assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && 2207 "Element store outside of alloca store"); 2208 uint64_t ShAmt = 8*Offset; 2209 if (DL.isBigEndian()) 2210 ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); 2211 if (ShAmt) { 2212 V = IRB.CreateShl(V, ShAmt, Name + ".shift"); 2213 DEBUG(dbgs() << " shifted: " << *V << "\n"); 2214 } 2215 2216 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { 2217 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); 2218 Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); 2219 DEBUG(dbgs() << " masked: " << *Old << "\n"); 2220 V = IRB.CreateOr(Old, V, Name + ".insert"); 2221 DEBUG(dbgs() << " inserted: " << *V << "\n"); 2222 
} 2223 return V; 2224} 2225 2226namespace { 2227/// \brief Visitor to rewrite instructions using a partition of an alloca to 2228/// use a new alloca. 2229/// 2230/// Also implements the rewriting to vector-based accesses when the partition 2231/// passes the isVectorPromotionViable predicate. Most of the rewriting logic 2232/// lives here. 2233class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, 2234 bool> { 2235 // Befriend the base class so it can delegate to private visit methods. 2236 friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>; 2237 2238 const DataLayout &TD; 2239 AllocaPartitioning &P; 2240 SROA &Pass; 2241 AllocaInst &OldAI, &NewAI; 2242 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; 2243 Type *NewAllocaTy; 2244 2245 // If we are rewriting an alloca partition which can be written as pure 2246 // vector operations, we stash extra information here. When VecTy is 2247 // non-null, we have some strict guarantees about the rewriten alloca: 2248 // - The new alloca is exactly the size of the vector type here. 2249 // - The accesses all either map to the entire vector or to a single 2250 // element. 2251 // - The set of accessing instructions is only one of those handled above 2252 // in isVectorPromotionViable. Generally these are the same access kinds 2253 // which are promotable via mem2reg. 2254 VectorType *VecTy; 2255 Type *ElementTy; 2256 uint64_t ElementSize; 2257 2258 // This is a convenience and flag variable that will be null unless the new 2259 // alloca's integer operations should be widened to this integer type due to 2260 // passing isIntegerWideningViable above. If it is non-null, the desired 2261 // integer type will be stored here for easy access during rewriting. 2262 IntegerType *IntTy; 2263 2264 // The offset of the partition user currently being rewritten. 2265 uint64_t BeginOffset, EndOffset; 2266 Use *OldUse; 2267 Instruction *OldPtr; 2268 2269 // The name prefix to use when rewriting instructions for this alloca. 2270 std::string NamePrefix; 2271 2272public: 2273 AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P, 2274 AllocaPartitioning::iterator PI, 2275 SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, 2276 uint64_t NewBeginOffset, uint64_t NewEndOffset) 2277 : TD(TD), P(P), Pass(Pass), 2278 OldAI(OldAI), NewAI(NewAI), 2279 NewAllocaBeginOffset(NewBeginOffset), 2280 NewAllocaEndOffset(NewEndOffset), 2281 NewAllocaTy(NewAI.getAllocatedType()), 2282 VecTy(), ElementTy(), ElementSize(), IntTy(), 2283 BeginOffset(), EndOffset() { 2284 } 2285 2286 /// \brief Visit the users of the alloca partition and rewrite them. 2287 bool visitUsers(AllocaPartitioning::const_use_iterator I, 2288 AllocaPartitioning::const_use_iterator E) { 2289 if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P, 2290 NewAllocaBeginOffset, NewAllocaEndOffset, 2291 I, E)) { 2292 ++NumVectorized; 2293 VecTy = cast<VectorType>(NewAI.getAllocatedType()); 2294 ElementTy = VecTy->getElementType(); 2295 assert((VecTy->getScalarSizeInBits() % 8) == 0 && 2296 "Only multiple-of-8 sized vector elements are viable"); 2297 ElementSize = VecTy->getScalarSizeInBits() / 8; 2298 } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(), 2299 NewAllocaBeginOffset, P, I, E)) { 2300 IntTy = Type::getIntNTy(NewAI.getContext(), 2301 TD.getTypeSizeInBits(NewAI.getAllocatedType())); 2302 } 2303 bool CanSROA = true; 2304 for (; I != E; ++I) { 2305 if (!I->U) 2306 continue; // Skip dead uses. 
2307 BeginOffset = I->BeginOffset; 2308 EndOffset = I->EndOffset; 2309 OldUse = I->U; 2310 OldPtr = cast<Instruction>(I->U->get()); 2311 NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str(); 2312 CanSROA &= visit(cast<Instruction>(I->U->getUser())); 2313 } 2314 if (VecTy) { 2315 assert(CanSROA); 2316 VecTy = 0; 2317 ElementTy = 0; 2318 ElementSize = 0; 2319 } 2320 if (IntTy) { 2321 assert(CanSROA); 2322 IntTy = 0; 2323 } 2324 return CanSROA; 2325 } 2326 2327private: 2328 // Every instruction which can end up as a user must have a rewrite rule. 2329 bool visitInstruction(Instruction &I) { 2330 DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); 2331 llvm_unreachable("No rewrite rule for this instruction!"); 2332 } 2333 2334 Twine getName(const Twine &Suffix) { 2335 return NamePrefix + Suffix; 2336 } 2337 2338 Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) { 2339 assert(BeginOffset >= NewAllocaBeginOffset); 2340 APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset); 2341 return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName("")); 2342 } 2343 2344 /// \brief Compute suitable alignment to access an offset into the new alloca. 2345 unsigned getOffsetAlign(uint64_t Offset) { 2346 unsigned NewAIAlign = NewAI.getAlignment(); 2347 if (!NewAIAlign) 2348 NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType()); 2349 return MinAlign(NewAIAlign, Offset); 2350 } 2351 2352 /// \brief Compute suitable alignment to access this partition of the new 2353 /// alloca. 2354 unsigned getPartitionAlign() { 2355 return getOffsetAlign(BeginOffset - NewAllocaBeginOffset); 2356 } 2357 2358 /// \brief Compute suitable alignment to access a type at an offset of the 2359 /// new alloca. 2360 /// 2361 /// \returns zero if the type's ABI alignment is a suitable alignment, 2362 /// otherwise returns the maximal suitable alignment. 2363 unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { 2364 unsigned Align = getOffsetAlign(Offset); 2365 return Align == TD.getABITypeAlignment(Ty) ? 0 : Align; 2366 } 2367 2368 /// \brief Compute suitable alignment to access a type at the beginning of 2369 /// this partition of the new alloca. 2370 /// 2371 /// See \c getOffsetTypeAlign for details; this routine delegates to it. 
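///
/// For example, with a 16-byte aligned new alloca and a use beginning 4 bytes
/// into this partition, getOffsetAlign yields MinAlign(16, 4) == 4; if that
/// happens to equal the type's ABI alignment, zero is returned so callers fall
/// back to the default alignment.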
2372 unsigned getPartitionTypeAlign(Type *Ty) { 2373 return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset); 2374 } 2375 2376 unsigned getIndex(uint64_t Offset) { 2377 assert(VecTy && "Can only call getIndex when rewriting a vector"); 2378 uint64_t RelOffset = Offset - NewAllocaBeginOffset; 2379 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); 2380 uint32_t Index = RelOffset / ElementSize; 2381 assert(Index * ElementSize == RelOffset); 2382 return Index; 2383 } 2384 2385 void deleteIfTriviallyDead(Value *V) { 2386 Instruction *I = cast<Instruction>(V); 2387 if (isInstructionTriviallyDead(I)) 2388 Pass.DeadInsts.insert(I); 2389 } 2390 2391 Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) { 2392 Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2393 getName(".load")); 2394 unsigned BeginIndex = getIndex(BeginOffset); 2395 unsigned EndIndex = getIndex(EndOffset); 2396 assert(EndIndex > BeginIndex && "Empty vector!"); 2397 unsigned NumElements = EndIndex - BeginIndex; 2398 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 2399 if (NumElements == 1) { 2400 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex), 2401 getName(".extract")); 2402 DEBUG(dbgs() << " extract: " << *V << "\n"); 2403 } else if (NumElements < VecTy->getNumElements()) { 2404 SmallVector<Constant*, 8> Mask; 2405 Mask.reserve(NumElements); 2406 for (unsigned i = BeginIndex; i != EndIndex; ++i) 2407 Mask.push_back(IRB.getInt32(i)); 2408 V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), 2409 ConstantVector::get(Mask), 2410 getName(".extract")); 2411 DEBUG(dbgs() << " shuffle: " << *V << "\n"); 2412 } 2413 return V; 2414 } 2415 2416 Value *rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) { 2417 assert(IntTy && "We cannot insert an integer to the alloca"); 2418 assert(!LI.isVolatile()); 2419 Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2420 getName(".load")); 2421 V = convertValue(TD, IRB, V, IntTy); 2422 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2423 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2424 if (Offset > 0 || EndOffset < NewAllocaEndOffset) 2425 V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset, 2426 getName(".extract")); 2427 return V; 2428 } 2429 2430 bool visitLoadInst(LoadInst &LI) { 2431 DEBUG(dbgs() << " original: " << LI << "\n"); 2432 Value *OldOp = LI.getOperand(0); 2433 assert(OldOp == OldPtr); 2434 IRBuilder<> IRB(&LI); 2435 2436 uint64_t Size = EndOffset - BeginOffset; 2437 bool IsSplitIntLoad = Size < TD.getTypeStoreSize(LI.getType()); 2438 2439 // If this memory access can be shown to *statically* extend outside the 2440 // bounds of the original allocation it's behavior is undefined. Rather 2441 // than trying to transform it, just replace it with undef. 2442 // FIXME: We should do something more clever for functions being 2443 // instrumented by asan. 2444 // FIXME: Eventually, once ASan and friends can flush out bugs here, this 2445 // should be transformed to a load of null making it unreachable. 2446 uint64_t OldAllocSize = TD.getTypeAllocSize(OldAI.getAllocatedType()); 2447 if (TD.getTypeStoreSize(LI.getType()) > OldAllocSize) { 2448 LI.replaceAllUsesWith(UndefValue::get(LI.getType())); 2449 Pass.DeadInsts.insert(&LI); 2450 deleteIfTriviallyDead(OldOp); 2451 DEBUG(dbgs() << " to: undef!!\n"); 2452 return true; 2453 } 2454 2455 Type *TargetTy = IsSplitIntLoad ? 
Type::getIntNTy(LI.getContext(), Size * 8) 2456 : LI.getType(); 2457 bool IsPtrAdjusted = false; 2458 Value *V; 2459 if (VecTy) { 2460 V = rewriteVectorizedLoadInst(IRB, LI, OldOp); 2461 } else if (IntTy && LI.getType()->isIntegerTy()) { 2462 V = rewriteIntegerLoad(IRB, LI); 2463 } else if (BeginOffset == NewAllocaBeginOffset && 2464 canConvertValue(TD, NewAllocaTy, LI.getType())) { 2465 V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2466 LI.isVolatile(), getName(".load")); 2467 } else { 2468 Type *LTy = TargetTy->getPointerTo(); 2469 V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy), 2470 getPartitionTypeAlign(TargetTy), 2471 LI.isVolatile(), getName(".load")); 2472 IsPtrAdjusted = true; 2473 } 2474 V = convertValue(TD, IRB, V, TargetTy); 2475 2476 if (IsSplitIntLoad) { 2477 assert(!LI.isVolatile()); 2478 assert(LI.getType()->isIntegerTy() && 2479 "Only integer type loads and stores are split"); 2480 assert(LI.getType()->getIntegerBitWidth() == 2481 TD.getTypeStoreSizeInBits(LI.getType()) && 2482 "Non-byte-multiple bit width"); 2483 assert(LI.getType()->getIntegerBitWidth() == 2484 TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) && 2485 "Only alloca-wide loads can be split and recomposed"); 2486 // Move the insertion point just past the load so that we can refer to it. 2487 IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI))); 2488 // Create a placeholder value with the same type as LI to use as the 2489 // basis for the new value. This allows us to replace the uses of LI with 2490 // the computed value, and then replace the placeholder with LI, leaving 2491 // LI only used for this computation. 2492 Value *Placeholder 2493 = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); 2494 V = insertInteger(TD, IRB, Placeholder, V, BeginOffset, 2495 getName(".insert")); 2496 LI.replaceAllUsesWith(V); 2497 Placeholder->replaceAllUsesWith(&LI); 2498 delete Placeholder; 2499 } else { 2500 LI.replaceAllUsesWith(V); 2501 } 2502 2503 Pass.DeadInsts.insert(&LI); 2504 deleteIfTriviallyDead(OldOp); 2505 DEBUG(dbgs() << " to: " << *V << "\n"); 2506 return !LI.isVolatile() && !IsPtrAdjusted; 2507 } 2508 2509 bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, Value *V, 2510 StoreInst &SI, Value *OldOp) { 2511 unsigned BeginIndex = getIndex(BeginOffset); 2512 unsigned EndIndex = getIndex(EndOffset); 2513 assert(EndIndex > BeginIndex && "Empty vector!"); 2514 unsigned NumElements = EndIndex - BeginIndex; 2515 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 2516 Type *PartitionTy 2517 = (NumElements == 1) ? ElementTy 2518 : VectorType::get(ElementTy, NumElements); 2519 if (V->getType() != PartitionTy) 2520 V = convertValue(TD, IRB, V, PartitionTy); 2521 if (NumElements < VecTy->getNumElements()) { 2522 // We need to mix in the existing elements. 2523 LoadInst *LI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2524 getName(".load")); 2525 if (NumElements == 1) { 2526 V = IRB.CreateInsertElement(LI, V, IRB.getInt32(BeginIndex), 2527 getName(".insert")); 2528 DEBUG(dbgs() << " insert: " << *V << "\n"); 2529 } else { 2530 // When inserting a smaller vector into the larger to store, we first 2531 // use a shuffle vector to widen it with undef elements, and then 2532 // a second shuffle vector to select between the loaded vector and the 2533 // incoming vector. 
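// For example, storing a 2-element value into elements [1,3) of a 4-element
// vector uses a first mask of <undef, 0, 1, undef> to widen the value and a
// second mask of <4, 1, 2, 7> to keep the loaded lanes 0 and 3 while taking
// the new lanes 1 and 2.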
2534 SmallVector<Constant*, 8> Mask; 2535 Mask.reserve(VecTy->getNumElements()); 2536 for (unsigned i = 0; i != VecTy->getNumElements(); ++i) 2537 if (i >= BeginIndex && i < EndIndex) 2538 Mask.push_back(IRB.getInt32(i - BeginIndex)); 2539 else 2540 Mask.push_back(UndefValue::get(IRB.getInt32Ty())); 2541 V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), 2542 ConstantVector::get(Mask), 2543 getName(".expand")); 2544 DEBUG(dbgs() << " shuffle1: " << *V << "\n"); 2545 2546 Mask.clear(); 2547 for (unsigned i = 0; i != VecTy->getNumElements(); ++i) 2548 if (i >= BeginIndex && i < EndIndex) 2549 Mask.push_back(IRB.getInt32(i)); 2550 else 2551 Mask.push_back(IRB.getInt32(i + VecTy->getNumElements())); 2552 V = IRB.CreateShuffleVector(V, LI, ConstantVector::get(Mask), 2553 getName("insert")); 2554 DEBUG(dbgs() << " shuffle2: " << *V << "\n"); 2555 } 2556 } else { 2557 V = convertValue(TD, IRB, V, VecTy); 2558 } 2559 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 2560 Pass.DeadInsts.insert(&SI); 2561 2562 (void)Store; 2563 DEBUG(dbgs() << " to: " << *Store << "\n"); 2564 return true; 2565 } 2566 2567 bool rewriteIntegerStore(IRBuilder<> &IRB, Value *V, StoreInst &SI) { 2568 assert(IntTy && "We cannot extract an integer from the alloca"); 2569 assert(!SI.isVolatile()); 2570 if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { 2571 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2572 getName(".oldload")); 2573 Old = convertValue(TD, IRB, Old, IntTy); 2574 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2575 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2576 V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset, 2577 getName(".insert")); 2578 } 2579 V = convertValue(TD, IRB, V, NewAllocaTy); 2580 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 2581 Pass.DeadInsts.insert(&SI); 2582 (void)Store; 2583 DEBUG(dbgs() << " to: " << *Store << "\n"); 2584 return true; 2585 } 2586 2587 bool visitStoreInst(StoreInst &SI) { 2588 DEBUG(dbgs() << " original: " << SI << "\n"); 2589 Value *OldOp = SI.getOperand(1); 2590 assert(OldOp == OldPtr); 2591 IRBuilder<> IRB(&SI); 2592 2593 Value *V = SI.getValueOperand(); 2594 2595 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2596 // alloca that should be re-examined after promoting this alloca. 
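// E.g. if the value being stored is the address of some other alloca, that
// alloca may stop escaping once this store is rewritten away, so it goes on
// the post-promotion worklist to be revisited after promotion.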
2597 if (V->getType()->isPointerTy()) 2598 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) 2599 Pass.PostPromotionWorklist.insert(AI); 2600 2601 uint64_t Size = EndOffset - BeginOffset; 2602 if (Size < TD.getTypeStoreSize(V->getType())) { 2603 assert(!SI.isVolatile()); 2604 assert(V->getType()->isIntegerTy() && 2605 "Only integer type loads and stores are split"); 2606 assert(V->getType()->getIntegerBitWidth() == 2607 TD.getTypeStoreSizeInBits(V->getType()) && 2608 "Non-byte-multiple bit width"); 2609 assert(V->getType()->getIntegerBitWidth() == 2610 TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) && 2611 "Only alloca-wide stores can be split and recomposed"); 2612 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8); 2613 V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset, 2614 getName(".extract")); 2615 } 2616 2617 if (VecTy) 2618 return rewriteVectorizedStoreInst(IRB, V, SI, OldOp); 2619 if (IntTy && V->getType()->isIntegerTy()) 2620 return rewriteIntegerStore(IRB, V, SI); 2621 2622 StoreInst *NewSI; 2623 if (BeginOffset == NewAllocaBeginOffset && 2624 canConvertValue(TD, V->getType(), NewAllocaTy)) { 2625 V = convertValue(TD, IRB, V, NewAllocaTy); 2626 NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), 2627 SI.isVolatile()); 2628 } else { 2629 Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo()); 2630 NewSI = IRB.CreateAlignedStore(V, NewPtr, 2631 getPartitionTypeAlign(V->getType()), 2632 SI.isVolatile()); 2633 } 2634 (void)NewSI; 2635 Pass.DeadInsts.insert(&SI); 2636 deleteIfTriviallyDead(OldOp); 2637 2638 DEBUG(dbgs() << " to: " << *NewSI << "\n"); 2639 return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile(); 2640 } 2641 2642 /// \brief Compute an integer value from splatting an i8 across the given 2643 /// number of bytes. 2644 /// 2645 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't 2646 /// call this routine. 2647 /// FIXME: Heed the abvice above. 2648 /// 2649 /// \param V The i8 value to splat. 2650 /// \param Size The number of bytes in the output (assuming i8 is one byte) 2651 Value *getIntegerSplat(IRBuilder<> &IRB, Value *V, unsigned Size) { 2652 assert(Size > 0 && "Expected a positive number of bytes."); 2653 IntegerType *VTy = cast<IntegerType>(V->getType()); 2654 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte"); 2655 if (Size == 1) 2656 return V; 2657 2658 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); 2659 V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")), 2660 ConstantExpr::getUDiv( 2661 Constant::getAllOnesValue(SplatIntTy), 2662 ConstantExpr::getZExt( 2663 Constant::getAllOnesValue(V->getType()), 2664 SplatIntTy)), 2665 getName(".isplat")); 2666 return V; 2667 } 2668 2669 bool visitMemSetInst(MemSetInst &II) { 2670 DEBUG(dbgs() << " original: " << II << "\n"); 2671 IRBuilder<> IRB(&II); 2672 assert(II.getRawDest() == OldPtr); 2673 2674 // If the memset has a variable size, it cannot be split, just adjust the 2675 // pointer to the new alloca. 2676 if (!isa<Constant>(II.getLength())) { 2677 II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); 2678 Type *CstTy = II.getAlignmentCst()->getType(); 2679 II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign())); 2680 2681 deleteIfTriviallyDead(OldPtr); 2682 return false; 2683 } 2684 2685 // Record this instruction for deletion. 
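// From this point on the memset length is a known constant, so the intrinsic
// is replaced outright: either by a memset over just this partition, or by a
// store of a splatted value when it maps onto the new alloca.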
2686 Pass.DeadInsts.insert(&II); 2687 2688 Type *AllocaTy = NewAI.getAllocatedType(); 2689 Type *ScalarTy = AllocaTy->getScalarType(); 2690 2691 // If this doesn't map cleanly onto the alloca type, and that type isn't 2692 // a single value type, just emit a memset. 2693 if (!VecTy && !IntTy && 2694 (BeginOffset != NewAllocaBeginOffset || 2695 EndOffset != NewAllocaEndOffset || 2696 !AllocaTy->isSingleValueType() || 2697 !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) { 2698 Type *SizeTy = II.getLength()->getType(); 2699 Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); 2700 CallInst *New 2701 = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB, 2702 II.getRawDest()->getType()), 2703 II.getValue(), Size, getPartitionAlign(), 2704 II.isVolatile()); 2705 (void)New; 2706 DEBUG(dbgs() << " to: " << *New << "\n"); 2707 return false; 2708 } 2709 2710 // If we can represent this as a simple value, we have to build the actual 2711 // value to store, which requires expanding the byte present in memset to 2712 // a sensible representation for the alloca type. This is essentially 2713 // splatting the byte to a sufficiently wide integer, bitcasting to the 2714 // desired scalar type, and splatting it across any desired vector type. 2715 uint64_t Size = EndOffset - BeginOffset; 2716 Value *V = getIntegerSplat(IRB, II.getValue(), Size); 2717 2718 // If this is an element-wide memset of a vectorizable alloca, insert it. 2719 if (VecTy && (BeginOffset > NewAllocaBeginOffset || 2720 EndOffset < NewAllocaEndOffset)) { 2721 if (V->getType() != ScalarTy) 2722 V = convertValue(TD, IRB, V, ScalarTy); 2723 StoreInst *Store = IRB.CreateAlignedStore( 2724 IRB.CreateInsertElement(IRB.CreateAlignedLoad(&NewAI, 2725 NewAI.getAlignment(), 2726 getName(".load")), 2727 V, IRB.getInt32(getIndex(BeginOffset)), 2728 getName(".insert")), 2729 &NewAI, NewAI.getAlignment()); 2730 (void)Store; 2731 DEBUG(dbgs() << " to: " << *Store << "\n"); 2732 return true; 2733 } 2734 2735 // If this is a memset on an alloca where we can widen stores, insert the 2736 // set integer. 2737 if (IntTy && (BeginOffset > NewAllocaBeginOffset || 2738 EndOffset < NewAllocaEndOffset)) { 2739 assert(!II.isVolatile()); 2740 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2741 getName(".oldload")); 2742 Old = convertValue(TD, IRB, Old, IntTy); 2743 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2744 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2745 V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert")); 2746 } 2747 2748 if (V->getType() != AllocaTy) 2749 V = convertValue(TD, IRB, V, AllocaTy); 2750 2751 Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), 2752 II.isVolatile()); 2753 (void)New; 2754 DEBUG(dbgs() << " to: " << *New << "\n"); 2755 return !II.isVolatile(); 2756 } 2757 2758 bool visitMemTransferInst(MemTransferInst &II) { 2759 // Rewriting of memory transfer instructions can be a bit tricky. We break 2760 // them into two categories: split intrinsics and unsplit intrinsics. 2761 2762 DEBUG(dbgs() << " original: " << II << "\n"); 2763 IRBuilder<> IRB(&II); 2764 2765 assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr); 2766 bool IsDest = II.getRawDest() == OldPtr; 2767 2768 const AllocaPartitioning::MemTransferOffsets &MTO 2769 = P.getMemTransferOffsets(II); 2770 2771 // Compute the relative offset within the transfer. 2772 unsigned IntPtrWidth = TD.getPointerSizeInBits(); 2773 APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? 
MTO.DestBegin 2774 : MTO.SourceBegin)); 2775 2776 unsigned Align = II.getAlignment(); 2777 if (Align > 1) 2778 Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), 2779 MinAlign(II.getAlignment(), getPartitionAlign())); 2780 2781 // For unsplit intrinsics, we simply modify the source and destination 2782 // pointers in place. This isn't just an optimization, it is a matter of 2783 // correctness. With unsplit intrinsics we may be dealing with transfers 2784 // within a single alloca before SROA ran, or with transfers that have 2785 // a variable length. We may also be dealing with memmove instead of 2786 // memcpy, and so simply updating the pointers is the necessary for us to 2787 // update both source and dest of a single call. 2788 if (!MTO.IsSplittable) { 2789 Value *OldOp = IsDest ? II.getRawDest() : II.getRawSource(); 2790 if (IsDest) 2791 II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); 2792 else 2793 II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType())); 2794 2795 Type *CstTy = II.getAlignmentCst()->getType(); 2796 II.setAlignment(ConstantInt::get(CstTy, Align)); 2797 2798 DEBUG(dbgs() << " to: " << II << "\n"); 2799 deleteIfTriviallyDead(OldOp); 2800 return false; 2801 } 2802 // For split transfer intrinsics we have an incredibly useful assurance: 2803 // the source and destination do not reside within the same alloca, and at 2804 // least one of them does not escape. This means that we can replace 2805 // memmove with memcpy, and we don't need to worry about all manner of 2806 // downsides to splitting and transforming the operations. 2807 2808 // If this doesn't map cleanly onto the alloca type, and that type isn't 2809 // a single value type, just emit a memcpy. 2810 bool EmitMemCpy 2811 = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset || 2812 EndOffset != NewAllocaEndOffset || 2813 !NewAI.getAllocatedType()->isSingleValueType()); 2814 2815 // If we're just going to emit a memcpy, the alloca hasn't changed, and the 2816 // size hasn't been shrunk based on analysis of the viable range, this is 2817 // a no-op. 2818 if (EmitMemCpy && &OldAI == &NewAI) { 2819 uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin; 2820 uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd; 2821 // Ensure the start lines up. 2822 assert(BeginOffset == OrigBegin); 2823 (void)OrigBegin; 2824 2825 // Rewrite the size as needed. 2826 if (EndOffset != OrigEnd) 2827 II.setLength(ConstantInt::get(II.getLength()->getType(), 2828 EndOffset - BeginOffset)); 2829 return false; 2830 } 2831 // Record this instruction for deletion. 2832 Pass.DeadInsts.insert(&II); 2833 2834 bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && 2835 EndOffset == NewAllocaEndOffset; 2836 bool IsVectorElement = VecTy && !IsWholeAlloca; 2837 uint64_t Size = EndOffset - BeginOffset; 2838 IntegerType *SubIntTy 2839 = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; 2840 2841 Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() 2842 : II.getRawDest()->getType(); 2843 if (!EmitMemCpy) { 2844 if (IsVectorElement) 2845 OtherPtrTy = VecTy->getElementType()->getPointerTo(); 2846 else if (IntTy && !IsWholeAlloca) 2847 OtherPtrTy = SubIntTy->getPointerTo(); 2848 else 2849 OtherPtrTy = NewAI.getType(); 2850 } 2851 2852 // Compute the other pointer, folding as much as possible to produce 2853 // a single, simple GEP in most cases. 2854 Value *OtherPtr = IsDest ? 
II.getRawSource() : II.getRawDest(); 2855 OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, 2856 getName("." + OtherPtr->getName())); 2857 2858 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2859 // alloca that should be re-examined after rewriting this instruction. 2860 if (AllocaInst *AI 2861 = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) 2862 Pass.Worklist.insert(AI); 2863 2864 if (EmitMemCpy) { 2865 Value *OurPtr 2866 = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() 2867 : II.getRawSource()->getType()); 2868 Type *SizeTy = II.getLength()->getType(); 2869 Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); 2870 2871 CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr, 2872 IsDest ? OtherPtr : OurPtr, 2873 Size, Align, II.isVolatile()); 2874 (void)New; 2875 DEBUG(dbgs() << " to: " << *New << "\n"); 2876 return false; 2877 } 2878 2879 // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy 2880 // is equivalent to 1, but that isn't true if we end up rewriting this as 2881 // a load or store. 2882 if (!Align) 2883 Align = 1; 2884 2885 Value *SrcPtr = OtherPtr; 2886 Value *DstPtr = &NewAI; 2887 if (!IsDest) 2888 std::swap(SrcPtr, DstPtr); 2889 2890 Value *Src; 2891 if (IsVectorElement && !IsDest) { 2892 // We have to extract rather than load. 2893 Src = IRB.CreateExtractElement( 2894 IRB.CreateAlignedLoad(SrcPtr, Align, getName(".copyload")), 2895 IRB.getInt32(getIndex(BeginOffset)), 2896 getName(".copyextract")); 2897 } else if (IntTy && !IsWholeAlloca && !IsDest) { 2898 Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2899 getName(".load")); 2900 Src = convertValue(TD, IRB, Src, IntTy); 2901 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2902 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2903 Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract")); 2904 } else { 2905 Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), 2906 getName(".copyload")); 2907 } 2908 2909 if (IntTy && !IsWholeAlloca && IsDest) { 2910 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2911 getName(".oldload")); 2912 Old = convertValue(TD, IRB, Old, IntTy); 2913 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2914 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2915 Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert")); 2916 Src = convertValue(TD, IRB, Src, NewAllocaTy); 2917 } 2918 2919 if (IsVectorElement && IsDest) { 2920 // We have to insert into a loaded copy before storing. 2921 Src = IRB.CreateInsertElement( 2922 IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")), 2923 Src, IRB.getInt32(getIndex(BeginOffset)), 2924 getName(".insert")); 2925 } 2926 2927 StoreInst *Store = cast<StoreInst>( 2928 IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile())); 2929 (void)Store; 2930 DEBUG(dbgs() << " to: " << *Store << "\n"); 2931 return !II.isVolatile(); 2932 } 2933 2934 bool visitIntrinsicInst(IntrinsicInst &II) { 2935 assert(II.getIntrinsicID() == Intrinsic::lifetime_start || 2936 II.getIntrinsicID() == Intrinsic::lifetime_end); 2937 DEBUG(dbgs() << " original: " << II << "\n"); 2938 IRBuilder<> IRB(&II); 2939 assert(II.getArgOperand(1) == OldPtr); 2940 2941 // Record this instruction for deletion. 
2942 Pass.DeadInsts.insert(&II); 2943 2944 ConstantInt *Size 2945 = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), 2946 EndOffset - BeginOffset); 2947 Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType()); 2948 Value *New; 2949 if (II.getIntrinsicID() == Intrinsic::lifetime_start) 2950 New = IRB.CreateLifetimeStart(Ptr, Size); 2951 else 2952 New = IRB.CreateLifetimeEnd(Ptr, Size); 2953 2954 DEBUG(dbgs() << " to: " << *New << "\n"); 2955 return true; 2956 } 2957 2958 bool visitPHINode(PHINode &PN) { 2959 DEBUG(dbgs() << " original: " << PN << "\n"); 2960 2961 // We would like to compute a new pointer in only one place, but have it be 2962 // as local as possible to the PHI. To do that, we re-use the location of 2963 // the old pointer, which necessarily must be in the right position to 2964 // dominate the PHI. 2965 IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr)); 2966 2967 Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); 2968 // Replace the operands which were using the old pointer. 2969 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); 2970 2971 DEBUG(dbgs() << " to: " << PN << "\n"); 2972 deleteIfTriviallyDead(OldPtr); 2973 return false; 2974 } 2975 2976 bool visitSelectInst(SelectInst &SI) { 2977 DEBUG(dbgs() << " original: " << SI << "\n"); 2978 IRBuilder<> IRB(&SI); 2979 2980 // Find the operand we need to rewrite here. 2981 bool IsTrueVal = SI.getTrueValue() == OldPtr; 2982 if (IsTrueVal) 2983 assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!"); 2984 else 2985 assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!"); 2986 2987 Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType()); 2988 SI.setOperand(IsTrueVal ? 1 : 2, NewPtr); 2989 DEBUG(dbgs() << " to: " << SI << "\n"); 2990 deleteIfTriviallyDead(OldPtr); 2991 return false; 2992 } 2993 2994}; 2995} 2996 2997namespace { 2998/// \brief Visitor to rewrite aggregate loads and stores as scalar. 2999/// 3000/// This pass aggressively rewrites all aggregate loads and stores on 3001/// a particular pointer (or any pointer derived from it which we can identify) 3002/// with scalar loads and stores. 3003class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { 3004 // Befriend the base class so it can delegate to private visit methods. 3005 friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; 3006 3007 const DataLayout &TD; 3008 3009 /// Queue of pointer uses to analyze and potentially rewrite. 3010 SmallVector<Use *, 8> Queue; 3011 3012 /// Set to prevent us from cycling with phi nodes and loops. 3013 SmallPtrSet<User *, 8> Visited; 3014 3015 /// The current pointer use being rewritten. This is used to dig up the used 3016 /// value (as opposed to the user). 3017 Use *U; 3018 3019public: 3020 AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {} 3021 3022 /// Rewrite loads and stores through a pointer and all pointers derived from 3023 /// it. 3024 bool rewrite(Instruction &I) { 3025 DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); 3026 enqueueUsers(I); 3027 bool Changed = false; 3028 while (!Queue.empty()) { 3029 U = Queue.pop_back_val(); 3030 Changed |= visit(cast<Instruction>(U->getUser())); 3031 } 3032 return Changed; 3033 } 3034 3035private: 3036 /// Enqueue all the users of the given instruction for further processing. 3037 /// This uses a set to de-duplicate users. 
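  /// (Illustrative note: because Visited keys on the user instruction, a PHI
  /// that uses the pointer along several incoming edges is enqueued only once,
  /// and a user reachable through more than one derived pointer is not
  /// re-analyzed.)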
3038  void enqueueUsers(Instruction &I) {
3039    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
3040         ++UI)
3041      if (Visited.insert(*UI))
3042        Queue.push_back(&UI.getUse());
3043  }
3044
3045  // Conservative default is to not rewrite anything.
3046  bool visitInstruction(Instruction &I) { return false; }
3047
3048  /// \brief Generic recursive split emission class.
3049  template <typename Derived>
3050  class OpSplitter {
3051  protected:
3052    /// The builder used to form new instructions.
3053    IRBuilder<> IRB;
3054    /// The indices to be used with insert- or extractvalue to select the
3055    /// appropriate value within the aggregate.
3056    SmallVector<unsigned, 4> Indices;
3057    /// The indices to a GEP instruction which will move Ptr to the correct slot
3058    /// within the aggregate.
3059    SmallVector<Value *, 4> GEPIndices;
3060    /// The base pointer of the original op, used as a base for GEPing the
3061    /// split operations.
3062    Value *Ptr;
3063
3064    /// Initialize the splitter with an insertion point and Ptr, and start with
3065    /// a single zero GEP index.
3066    OpSplitter(Instruction *InsertionPoint, Value *Ptr)
3067      : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
3068
3069  public:
3070    /// \brief Generic recursive split emission routine.
3071    ///
3072    /// This method recursively splits an aggregate op (load or store) into
3073    /// scalar or vector ops. It splits recursively until it hits a single value
3074    /// and emits that single value operation via the template argument.
3075    ///
3076    /// The logic of this routine relies on GEPs and insertvalue and
3077    /// extractvalue all operating with the same fundamental index list, merely
3078    /// formatted differently (GEPs need actual values).
3079    ///
3080    /// \param Ty  The type being split recursively into smaller ops.
3081    /// \param Agg The aggregate value being built up or stored, depending on
3082    /// whether this is splitting a load or a store respectively.
3083    void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
3084      if (Ty->isSingleValueType())
3085        return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);
3086
3087      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
3088        unsigned OldSize = Indices.size();
3089        (void)OldSize;
3090        for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
3091             ++Idx) {
3092          assert(Indices.size() == OldSize && "Did not return to the old size");
3093          Indices.push_back(Idx);
3094          GEPIndices.push_back(IRB.getInt32(Idx));
3095          emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
3096          GEPIndices.pop_back();
3097          Indices.pop_back();
3098        }
3099        return;
3100      }
3101
3102      if (StructType *STy = dyn_cast<StructType>(Ty)) {
3103        unsigned OldSize = Indices.size();
3104        (void)OldSize;
3105        for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
3106             ++Idx) {
3107          assert(Indices.size() == OldSize && "Did not return to the old size");
3108          Indices.push_back(Idx);
3109          GEPIndices.push_back(IRB.getInt32(Idx));
3110          emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
3111          GEPIndices.pop_back();
3112          Indices.pop_back();
3113        }
3114        return;
3115      }
3116
3117      llvm_unreachable("Only arrays and structs are aggregate loadable types");
3118    }
3119  };
3120
3121  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
3122    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
3123      : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
3124
3125    /// Emit a leaf load of a single value.
This is called at the leaves of the 3126 /// recursive emission to actually load values. 3127 void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { 3128 assert(Ty->isSingleValueType()); 3129 // Load the single value and insert it using the indices. 3130 Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices, 3131 Name + ".gep"), 3132 Name + ".load"); 3133 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); 3134 DEBUG(dbgs() << " to: " << *Load << "\n"); 3135 } 3136 }; 3137 3138 bool visitLoadInst(LoadInst &LI) { 3139 assert(LI.getPointerOperand() == *U); 3140 if (!LI.isSimple() || LI.getType()->isSingleValueType()) 3141 return false; 3142 3143 // We have an aggregate being loaded, split it apart. 3144 DEBUG(dbgs() << " original: " << LI << "\n"); 3145 LoadOpSplitter Splitter(&LI, *U); 3146 Value *V = UndefValue::get(LI.getType()); 3147 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); 3148 LI.replaceAllUsesWith(V); 3149 LI.eraseFromParent(); 3150 return true; 3151 } 3152 3153 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { 3154 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) 3155 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} 3156 3157 /// Emit a leaf store of a single value. This is called at the leaves of the 3158 /// recursive emission to actually produce stores. 3159 void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { 3160 assert(Ty->isSingleValueType()); 3161 // Extract the single value and store it using the indices. 3162 Value *Store = IRB.CreateStore( 3163 IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), 3164 IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); 3165 (void)Store; 3166 DEBUG(dbgs() << " to: " << *Store << "\n"); 3167 } 3168 }; 3169 3170 bool visitStoreInst(StoreInst &SI) { 3171 if (!SI.isSimple() || SI.getPointerOperand() != *U) 3172 return false; 3173 Value *V = SI.getValueOperand(); 3174 if (V->getType()->isSingleValueType()) 3175 return false; 3176 3177 // We have an aggregate being stored, split it apart. 3178 DEBUG(dbgs() << " original: " << SI << "\n"); 3179 StoreOpSplitter Splitter(&SI, *U); 3180 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); 3181 SI.eraseFromParent(); 3182 return true; 3183 } 3184 3185 bool visitBitCastInst(BitCastInst &BC) { 3186 enqueueUsers(BC); 3187 return false; 3188 } 3189 3190 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { 3191 enqueueUsers(GEPI); 3192 return false; 3193 } 3194 3195 bool visitPHINode(PHINode &PN) { 3196 enqueueUsers(PN); 3197 return false; 3198 } 3199 3200 bool visitSelectInst(SelectInst &SI) { 3201 enqueueUsers(SI); 3202 return false; 3203 } 3204}; 3205} 3206 3207/// \brief Strip aggregate type wrapping. 3208/// 3209/// This removes no-op aggregate types wrapping an underlying type. It will 3210/// strip as many layers of types as it can without changing either the type 3211/// size or the allocated size. 
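/// For example (illustrative): a {{i32}} or [1 x i32] wrapper is stripped down
/// to the inner i32, whereas [2 x i32] is left alone because dropping the
/// array would change the allocated size.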
3212static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { 3213 if (Ty->isSingleValueType()) 3214 return Ty; 3215 3216 uint64_t AllocSize = DL.getTypeAllocSize(Ty); 3217 uint64_t TypeSize = DL.getTypeSizeInBits(Ty); 3218 3219 Type *InnerTy; 3220 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 3221 InnerTy = ArrTy->getElementType(); 3222 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 3223 const StructLayout *SL = DL.getStructLayout(STy); 3224 unsigned Index = SL->getElementContainingOffset(0); 3225 InnerTy = STy->getElementType(Index); 3226 } else { 3227 return Ty; 3228 } 3229 3230 if (AllocSize > DL.getTypeAllocSize(InnerTy) || 3231 TypeSize > DL.getTypeSizeInBits(InnerTy)) 3232 return Ty; 3233 3234 return stripAggregateTypeWrapping(DL, InnerTy); 3235} 3236 3237/// \brief Try to find a partition of the aggregate type passed in for a given 3238/// offset and size. 3239/// 3240/// This recurses through the aggregate type and tries to compute a subtype 3241/// based on the offset and size. When the offset and size span a sub-section 3242/// of an array, it will even compute a new array type for that sub-section, 3243/// and the same for structs. 3244/// 3245/// Note that this routine is very strict and tries to find a partition of the 3246/// type which produces the *exact* right offset and size. It is not forgiving 3247/// when the size or offset cause either end of type-based partition to be off. 3248/// Also, this is a best-effort routine. It is reasonable to give up and not 3249/// return a type if necessary. 3250static Type *getTypePartition(const DataLayout &TD, Type *Ty, 3251 uint64_t Offset, uint64_t Size) { 3252 if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size) 3253 return stripAggregateTypeWrapping(TD, Ty); 3254 if (Offset > TD.getTypeAllocSize(Ty) || 3255 (TD.getTypeAllocSize(Ty) - Offset) < Size) 3256 return 0; 3257 3258 if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { 3259 // We can't partition pointers... 3260 if (SeqTy->isPointerTy()) 3261 return 0; 3262 3263 Type *ElementTy = SeqTy->getElementType(); 3264 uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); 3265 uint64_t NumSkippedElements = Offset / ElementSize; 3266 if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) 3267 if (NumSkippedElements >= ArrTy->getNumElements()) 3268 return 0; 3269 if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) 3270 if (NumSkippedElements >= VecTy->getNumElements()) 3271 return 0; 3272 Offset -= NumSkippedElements * ElementSize; 3273 3274 // First check if we need to recurse. 3275 if (Offset > 0 || Size < ElementSize) { 3276 // Bail if the partition ends in a different array element. 3277 if ((Offset + Size) > ElementSize) 3278 return 0; 3279 // Recurse through the element type trying to peel off offset bytes. 
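      // (Illustrative example: for [4 x {i32, i32}] with the byte range
      // [12, 16), one 8-byte element is skipped above, leaving Offset == 4
      // within the second element, and the recursion yields the i32 type of
      // that struct's second field.)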
3280 return getTypePartition(TD, ElementTy, Offset, Size); 3281 } 3282 assert(Offset == 0); 3283 3284 if (Size == ElementSize) 3285 return stripAggregateTypeWrapping(TD, ElementTy); 3286 assert(Size > ElementSize); 3287 uint64_t NumElements = Size / ElementSize; 3288 if (NumElements * ElementSize != Size) 3289 return 0; 3290 return ArrayType::get(ElementTy, NumElements); 3291 } 3292 3293 StructType *STy = dyn_cast<StructType>(Ty); 3294 if (!STy) 3295 return 0; 3296 3297 const StructLayout *SL = TD.getStructLayout(STy); 3298 if (Offset >= SL->getSizeInBytes()) 3299 return 0; 3300 uint64_t EndOffset = Offset + Size; 3301 if (EndOffset > SL->getSizeInBytes()) 3302 return 0; 3303 3304 unsigned Index = SL->getElementContainingOffset(Offset); 3305 Offset -= SL->getElementOffset(Index); 3306 3307 Type *ElementTy = STy->getElementType(Index); 3308 uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); 3309 if (Offset >= ElementSize) 3310 return 0; // The offset points into alignment padding. 3311 3312 // See if any partition must be contained by the element. 3313 if (Offset > 0 || Size < ElementSize) { 3314 if ((Offset + Size) > ElementSize) 3315 return 0; 3316 return getTypePartition(TD, ElementTy, Offset, Size); 3317 } 3318 assert(Offset == 0); 3319 3320 if (Size == ElementSize) 3321 return stripAggregateTypeWrapping(TD, ElementTy); 3322 3323 StructType::element_iterator EI = STy->element_begin() + Index, 3324 EE = STy->element_end(); 3325 if (EndOffset < SL->getSizeInBytes()) { 3326 unsigned EndIndex = SL->getElementContainingOffset(EndOffset); 3327 if (Index == EndIndex) 3328 return 0; // Within a single element and its padding. 3329 3330 // Don't try to form "natural" types if the elements don't line up with the 3331 // expected size. 3332 // FIXME: We could potentially recurse down through the last element in the 3333 // sub-struct to find a natural end point. 3334 if (SL->getElementOffset(EndIndex) != EndOffset) 3335 return 0; 3336 3337 assert(Index < EndIndex); 3338 EE = STy->element_begin() + EndIndex; 3339 } 3340 3341 // Try to build up a sub-structure. 3342 StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), 3343 STy->isPacked()); 3344 const StructLayout *SubSL = TD.getStructLayout(SubTy); 3345 if (Size != SubSL->getSizeInBytes()) 3346 return 0; // The sub-struct doesn't have quite the size needed. 3347 3348 return SubTy; 3349} 3350 3351/// \brief Rewrite an alloca partition's users. 3352/// 3353/// This routine drives both of the rewriting goals of the SROA pass. It tries 3354/// to rewrite uses of an alloca partition to be conducive for SSA value 3355/// promotion. If the partition needs a new, more refined alloca, this will 3356/// build that new alloca, preserving as much type information as possible, and 3357/// rewrite the uses of the old alloca to point at the new one and have the 3358/// appropriate new offsets. It also evaluates how successful the rewrite was 3359/// at enabling promotion and if it was successful queues the alloca to be 3360/// promoted. 3361bool SROA::rewriteAllocaPartition(AllocaInst &AI, 3362 AllocaPartitioning &P, 3363 AllocaPartitioning::iterator PI) { 3364 uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset; 3365 bool IsLive = false; 3366 for (AllocaPartitioning::use_iterator UI = P.use_begin(PI), 3367 UE = P.use_end(PI); 3368 UI != UE && !IsLive; ++UI) 3369 if (UI->U) 3370 IsLive = true; 3371 if (!IsLive) 3372 return false; // No live uses left of this partition. 
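  // Speculating PHIs and selects turns, for example, a load of a select
  // between two pointers into this partition into a select of two loads, so
  // the pointer instruction itself no longer blocks promotion (illustrative
  // sketch; see PHIOrSelectSpeculator).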
3373
3374   DEBUG(dbgs() << "Speculating PHIs and selects in partition "
3375                << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n");
3376
3377   PHIOrSelectSpeculator Speculator(*TD, P, *this);
3378   DEBUG(dbgs() << "  speculating ");
3379   DEBUG(P.print(dbgs(), PI, ""));
3380   Speculator.visitUsers(PI);
3381
3382   // Try to compute a friendly type for this partition of the alloca. This
3383   // won't always succeed, in which case we fall back to a legal integer type
3384   // or an i8 array of an appropriate size.
3385   Type *AllocaTy = 0;
3386   if (Type *PartitionTy = P.getCommonType(PI))
3387     if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize)
3388       AllocaTy = PartitionTy;
3389   if (!AllocaTy)
3390     if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(),
3391                                              PI->BeginOffset, AllocaSize))
3392       AllocaTy = PartitionTy;
3393   if ((!AllocaTy ||
3394        (AllocaTy->isArrayTy() &&
3395         AllocaTy->getArrayElementType()->isIntegerTy())) &&
3396       TD->isLegalInteger(AllocaSize * 8))
3397     AllocaTy = Type::getIntNTy(*C, AllocaSize * 8);
3398   if (!AllocaTy)
3399     AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize);
3400   assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize);
3401
3402   // Check for the case where we're going to rewrite to a new alloca of the
3403   // exact same type as the original, and with the same access offsets. In that
3404   // case, re-use the existing alloca, but still run through the rewriter to
3405   // perform phi and select speculation.
3406   AllocaInst *NewAI;
3407   if (AllocaTy == AI.getAllocatedType()) {
3408     assert(PI->BeginOffset == 0 &&
3409            "Non-zero begin offset but same alloca type");
3410     assert(PI == P.begin() && "Begin offset is zero on later partition");
3411     NewAI = &AI;
3412   } else {
3413     unsigned Alignment = AI.getAlignment();
3414     if (!Alignment) {
3415       // The minimum alignment which users can rely on when the explicit
3416       // alignment is omitted or zero is that required by the ABI for this
3417       // type.
3418       Alignment = TD->getABITypeAlignment(AI.getAllocatedType());
3419     }
3420     Alignment = MinAlign(Alignment, PI->BeginOffset);
3421     // If we will get at least this much alignment from the type alone, leave
3422     // the alloca's alignment unconstrained.
3423     if (Alignment <= TD->getABITypeAlignment(AllocaTy))
3424       Alignment = 0;
3425     NewAI = new AllocaInst(AllocaTy, 0, Alignment,
3426                            AI.getName() + ".sroa." + Twine(PI - P.begin()),
3427                            &AI);
3428     ++NumNewAllocas;
3429   }
3430
3431   DEBUG(dbgs() << "Rewriting alloca partition "
3432                << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: "
3433                << *NewAI << "\n");
3434
3435   // Track the high watermark of the post-promotion worklist. We will reset it
3436   // to this point if the alloca is not in fact scheduled for promotion.
3437   unsigned PPWOldSize = PostPromotionWorklist.size();
3438
3439   AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI,
3440                                    PI->BeginOffset, PI->EndOffset);
3441   DEBUG(dbgs() << "  rewriting ");
3442   DEBUG(P.print(dbgs(), PI, ""));
3443   bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI));
3444   if (Promotable) {
3445     DEBUG(dbgs() << "  and queuing for promotion\n");
3446     PromotableAllocas.push_back(NewAI);
3447   } else if (NewAI != &AI) {
3448     // If we can't promote the alloca, iterate on it to check for new
3449     // refinements exposed by splitting the current alloca. Don't iterate on an
3450     // alloca which didn't actually change and didn't get promoted.
3451 Worklist.insert(NewAI); 3452 } 3453 3454 // Drop any post-promotion work items if promotion didn't happen. 3455 if (!Promotable) 3456 while (PostPromotionWorklist.size() > PPWOldSize) 3457 PostPromotionWorklist.pop_back(); 3458 3459 return true; 3460} 3461 3462/// \brief Walks the partitioning of an alloca rewriting uses of each partition. 3463bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) { 3464 bool Changed = false; 3465 for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE; 3466 ++PI) 3467 Changed |= rewriteAllocaPartition(AI, P, PI); 3468 3469 return Changed; 3470} 3471 3472/// \brief Analyze an alloca for SROA. 3473/// 3474/// This analyzes the alloca to ensure we can reason about it, builds 3475/// a partitioning of the alloca, and then hands it off to be split and 3476/// rewritten as needed. 3477bool SROA::runOnAlloca(AllocaInst &AI) { 3478 DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); 3479 ++NumAllocasAnalyzed; 3480 3481 // Special case dead allocas, as they're trivial. 3482 if (AI.use_empty()) { 3483 AI.eraseFromParent(); 3484 return true; 3485 } 3486 3487 // Skip alloca forms that this analysis can't handle. 3488 if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || 3489 TD->getTypeAllocSize(AI.getAllocatedType()) == 0) 3490 return false; 3491 3492 bool Changed = false; 3493 3494 // First, split any FCA loads and stores touching this alloca to promote 3495 // better splitting and promotion opportunities. 3496 AggLoadStoreRewriter AggRewriter(*TD); 3497 Changed |= AggRewriter.rewrite(AI); 3498 3499 // Build the partition set using a recursive instruction-visiting builder. 3500 AllocaPartitioning P(*TD, AI); 3501 DEBUG(P.print(dbgs())); 3502 if (P.isEscaped()) 3503 return Changed; 3504 3505 // Delete all the dead users of this alloca before splitting and rewriting it. 3506 for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(), 3507 DE = P.dead_user_end(); 3508 DI != DE; ++DI) { 3509 Changed = true; 3510 (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType())); 3511 DeadInsts.insert(*DI); 3512 } 3513 for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(), 3514 DE = P.dead_op_end(); 3515 DO != DE; ++DO) { 3516 Value *OldV = **DO; 3517 // Clobber the use with an undef value. 3518 **DO = UndefValue::get(OldV->getType()); 3519 if (Instruction *OldI = dyn_cast<Instruction>(OldV)) 3520 if (isInstructionTriviallyDead(OldI)) { 3521 Changed = true; 3522 DeadInsts.insert(OldI); 3523 } 3524 } 3525 3526 // No partitions to split. Leave the dead alloca for a later pass to clean up. 3527 if (P.begin() == P.end()) 3528 return Changed; 3529 3530 return splitAlloca(AI, P) || Changed; 3531} 3532 3533/// \brief Delete the dead instructions accumulated in this run. 3534/// 3535/// Recursively deletes the dead instructions we've accumulated. This is done 3536/// at the very end to maximize locality of the recursive delete and to 3537/// minimize the problems of invalidated instruction pointers as such pointers 3538/// are used heavily in the intermediate stages of the algorithm. 3539/// 3540/// We also record the alloca instructions deleted here so that they aren't 3541/// subsequently handed to mem2reg to promote. 
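/// For example (illustrative): deleting a dead memcpy can leave the bitcast
/// feeding its pointer operand trivially dead; the bitcast is then queued and
/// removed in the same sweep, and any alloca exposed as dead this way is added
/// to DeletedAllocas.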
3542 void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
3543   while (!DeadInsts.empty()) {
3544     Instruction *I = DeadInsts.pop_back_val();
3545     DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
3546
3547     I->replaceAllUsesWith(UndefValue::get(I->getType()));
3548
3549     for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
3550       if (Instruction *U = dyn_cast<Instruction>(*OI)) {
3551         // Zero out the operand and see if it becomes trivially dead.
3552         *OI = 0;
3553         if (isInstructionTriviallyDead(U))
3554           DeadInsts.insert(U);
3555       }
3556
3557     if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
3558       DeletedAllocas.insert(AI);
3559
3560     ++NumDeleted;
3561     I->eraseFromParent();
3562   }
3563 }
3564
3565 /// \brief Promote the allocas, using the best available technique.
3566 ///
3567 /// This attempts to promote whatever allocas have been identified as viable in
3568 /// the PromotableAllocas list. If that list is empty, there is nothing to do.
3569 /// If there is a domtree available, we attempt to promote using the full power
3570 /// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is
3571 /// based on the SSAUpdater utilities. This function returns whether any
3572 /// promotion occurred.
3573 bool SROA::promoteAllocas(Function &F) {
3574   if (PromotableAllocas.empty())
3575     return false;
3576
3577   NumPromoted += PromotableAllocas.size();
3578
3579   if (DT && !ForceSSAUpdater) {
3580     DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
3581     PromoteMemToReg(PromotableAllocas, *DT);
3582     PromotableAllocas.clear();
3583     return true;
3584   }
3585
3586   DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
3587   SSAUpdater SSA;
3588   DIBuilder DIB(*F.getParent());
3589   SmallVector<Instruction*, 64> Insts;
3590
3591   for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
3592     AllocaInst *AI = PromotableAllocas[Idx];
3593     for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
3594          UI != UE;) {
3595       Instruction *I = cast<Instruction>(*UI++);
3596       // FIXME: Currently the SSAUpdater infrastructure doesn't reason about
3597       // lifetime intrinsics and so we strip them (and the bitcasts+GEPs
3598       // leading to them) here. Eventually it should use them to optimize the
3599       // scalar values produced.
3600       if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
3601         assert(onlyUsedByLifetimeMarkers(I) &&
3602                "Found a bitcast used outside of a lifetime marker.");
3603         while (!I->use_empty())
3604           cast<Instruction>(*I->use_begin())->eraseFromParent();
3605         I->eraseFromParent();
3606         continue;
3607       }
3608       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
3609         assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
3610                II->getIntrinsicID() == Intrinsic::lifetime_end);
3611         II->eraseFromParent();
3612         continue;
3613       }
3614
3615       Insts.push_back(I);
3616     }
3617     AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
3618     Insts.clear();
3619   }
3620
3621   PromotableAllocas.clear();
3622   return true;
3623 }
3624
3625 namespace {
3626   /// \brief A predicate to test whether an alloca belongs to a set.
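  /// (Illustrative usage, as below:
  /// Worklist.remove_if(IsAllocaInSet(DeletedAllocas)) drops just-deleted
  /// allocas from the worklist.)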
3627 class IsAllocaInSet { 3628 typedef SmallPtrSet<AllocaInst *, 4> SetType; 3629 const SetType &Set; 3630 3631 public: 3632 typedef AllocaInst *argument_type; 3633 3634 IsAllocaInSet(const SetType &Set) : Set(Set) {} 3635 bool operator()(AllocaInst *AI) const { return Set.count(AI); } 3636 }; 3637} 3638 3639bool SROA::runOnFunction(Function &F) { 3640 DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); 3641 C = &F.getContext(); 3642 TD = getAnalysisIfAvailable<DataLayout>(); 3643 if (!TD) { 3644 DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); 3645 return false; 3646 } 3647 DT = getAnalysisIfAvailable<DominatorTree>(); 3648 3649 BasicBlock &EntryBB = F.getEntryBlock(); 3650 for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end()); 3651 I != E; ++I) 3652 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) 3653 Worklist.insert(AI); 3654 3655 bool Changed = false; 3656 // A set of deleted alloca instruction pointers which should be removed from 3657 // the list of promotable allocas. 3658 SmallPtrSet<AllocaInst *, 4> DeletedAllocas; 3659 3660 do { 3661 while (!Worklist.empty()) { 3662 Changed |= runOnAlloca(*Worklist.pop_back_val()); 3663 deleteDeadInstructions(DeletedAllocas); 3664 3665 // Remove the deleted allocas from various lists so that we don't try to 3666 // continue processing them. 3667 if (!DeletedAllocas.empty()) { 3668 Worklist.remove_if(IsAllocaInSet(DeletedAllocas)); 3669 PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas)); 3670 PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(), 3671 PromotableAllocas.end(), 3672 IsAllocaInSet(DeletedAllocas)), 3673 PromotableAllocas.end()); 3674 DeletedAllocas.clear(); 3675 } 3676 } 3677 3678 Changed |= promoteAllocas(F); 3679 3680 Worklist = PostPromotionWorklist; 3681 PostPromotionWorklist.clear(); 3682 } while (!Worklist.empty()); 3683 3684 return Changed; 3685} 3686 3687void SROA::getAnalysisUsage(AnalysisUsage &AU) const { 3688 if (RequiresDomTree) 3689 AU.addRequired<DominatorTree>(); 3690 AU.setPreservesCFG(); 3691} 3692
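// Note (illustrative): when a DominatorTree is not available, or when
// -force-ssa-updater is set, promoteAllocas above falls back to the
// SSAUpdater-based AllocaPromoter instead of mem2reg.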