SROA.cpp revision 98281a20503896349bd152e2dfe87435d3a6aada
1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9/// \file 10/// This transformation implements the well known scalar replacement of 11/// aggregates transformation. It tries to identify promotable elements of an 12/// aggregate alloca, and promote them to registers. It will also try to 13/// convert uses of an element (or set of elements) of an alloca into a vector 14/// or bitfield-style integer scalar if appropriate. 15/// 16/// It works to do this with minimal slicing of the alloca so that regions 17/// which are merely transferred in and out of external memory remain unchanged 18/// and are not decomposed to scalar code. 19/// 20/// Because this also performs alloca promotion, it can be thought of as also 21/// serving the purpose of SSA formation. The algorithm iterates on the 22/// function until all opportunities for promotion have been realized. 23/// 24//===----------------------------------------------------------------------===// 25 26#define DEBUG_TYPE "sroa" 27#include "llvm/Transforms/Scalar.h" 28#include "llvm/ADT/STLExtras.h" 29#include "llvm/ADT/SetVector.h" 30#include "llvm/ADT/SmallVector.h" 31#include "llvm/ADT/Statistic.h" 32#include "llvm/Analysis/Dominators.h" 33#include "llvm/Analysis/Loads.h" 34#include "llvm/Analysis/PtrUseVisitor.h" 35#include "llvm/Analysis/ValueTracking.h" 36#include "llvm/Constants.h" 37#include "llvm/DIBuilder.h" 38#include "llvm/DataLayout.h" 39#include "llvm/DebugInfo.h" 40#include "llvm/DerivedTypes.h" 41#include "llvm/Function.h" 42#include "llvm/IRBuilder.h" 43#include "llvm/InstVisitor.h" 44#include "llvm/Instructions.h" 45#include "llvm/IntrinsicInst.h" 46#include "llvm/LLVMContext.h" 47#include "llvm/Module.h" 48#include "llvm/Operator.h" 49#include "llvm/Pass.h" 50#include "llvm/Support/CommandLine.h" 51#include "llvm/Support/Debug.h" 52#include "llvm/Support/ErrorHandling.h" 53#include "llvm/Support/GetElementPtrTypeIterator.h" 54#include "llvm/Support/MathExtras.h" 55#include "llvm/Support/raw_ostream.h" 56#include "llvm/Transforms/Utils/Local.h" 57#include "llvm/Transforms/Utils/PromoteMemToReg.h" 58#include "llvm/Transforms/Utils/SSAUpdater.h" 59using namespace llvm; 60 61STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); 62STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); 63STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); 64STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); 65STATISTIC(NumDeleted, "Number of instructions deleted"); 66STATISTIC(NumVectorized, "Number of vectorized aggregates"); 67 68/// Hidden option to force the pass to not use DomTree and mem2reg, instead 69/// forming SSA values through the SSAUpdater infrastructure. 70static cl::opt<bool> 71ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); 72 73namespace { 74/// \brief Alloca partitioning representation. 75/// 76/// This class represents a partitioning of an alloca into slices, and 77/// information about the nature of uses of each slice of the alloca. The goal 78/// is that this information is sufficient to decide if and how to split the 79/// alloca apart and replace slices with scalars. It is also intended that this 80/// structure can capture the relevant information needed both to decide about 81/// and to enact these transformations. 82class AllocaPartitioning { 83public: 84 /// \brief A common base class for representing a half-open byte range. 85 struct ByteRange { 86 /// \brief The beginning offset of the range. 87 uint64_t BeginOffset; 88 89 /// \brief The ending offset, not included in the range. 90 uint64_t EndOffset; 91 92 ByteRange() : BeginOffset(), EndOffset() {} 93 ByteRange(uint64_t BeginOffset, uint64_t EndOffset) 94 : BeginOffset(BeginOffset), EndOffset(EndOffset) {} 95 96 /// \brief Support for ordering ranges. 97 /// 98 /// This provides an ordering over ranges such that start offsets are 99 /// always increasing, and within equal start offsets, the end offsets are 100 /// decreasing. Thus the spanning range comes first in a cluster with the 101 /// same start position. 102 bool operator<(const ByteRange &RHS) const { 103 if (BeginOffset < RHS.BeginOffset) return true; 104 if (BeginOffset > RHS.BeginOffset) return false; 105 if (EndOffset > RHS.EndOffset) return true; 106 return false; 107 } 108 109 /// \brief Support comparison with a single offset to allow binary searches. 110 friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) { 111 return LHS.BeginOffset < RHSOffset; 112 } 113 114 friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, 115 const ByteRange &RHS) { 116 return LHSOffset < RHS.BeginOffset; 117 } 118 119 bool operator==(const ByteRange &RHS) const { 120 return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset; 121 } 122 bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); } 123 }; 124 125 /// \brief A partition of an alloca. 126 /// 127 /// This structure represents a contiguous partition of the alloca. These are 128 /// formed by examining the uses of the alloca. During formation, they may 129 /// overlap but once an AllocaPartitioning is built, the Partitions within it 130 /// are all disjoint. 131 struct Partition : public ByteRange { 132 /// \brief Whether this partition is splittable into smaller partitions. 133 /// 134 /// We flag partitions as splittable when they are formed entirely due to 135 /// accesses by trivially splittable operations such as memset and memcpy. 136 bool IsSplittable; 137 138 /// \brief Test whether a partition has been marked as dead. 139 bool isDead() const { 140 if (BeginOffset == UINT64_MAX) { 141 assert(EndOffset == UINT64_MAX); 142 return true; 143 } 144 return false; 145 } 146 147 /// \brief Kill a partition. 148 /// This is accomplished by setting both its beginning and end offset to 149 /// the maximum possible value. 150 void kill() { 151 assert(!isDead() && "He's Dead, Jim!"); 152 BeginOffset = EndOffset = UINT64_MAX; 153 } 154 155 Partition() : ByteRange(), IsSplittable() {} 156 Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) 157 : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} 158 }; 159 160 /// \brief A particular use of a partition of the alloca. 161 /// 162 /// This structure is used to associate uses of a partition with it. They 163 /// mark the range of bytes which are referenced by a particular instruction, 164 /// and includes a handle to the user itself and the pointer value in use. 165 /// The bounds of these uses are determined by intersecting the bounds of the 166 /// memory use itself with a particular partition. As a consequence there is 167 /// intentionally overlap between various uses of the same partition. 168 struct PartitionUse : public ByteRange { 169 /// \brief The use in question. Provides access to both user and used value. 170 /// 171 /// Note that this may be null if the partition use is *dead*, that is, it 172 /// should be ignored. 173 Use *U; 174 175 PartitionUse() : ByteRange(), U() {} 176 PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U) 177 : ByteRange(BeginOffset, EndOffset), U(U) {} 178 }; 179 180 /// \brief Construct a partitioning of a particular alloca. 181 /// 182 /// Construction does most of the work for partitioning the alloca. This 183 /// performs the necessary walks of users and builds a partitioning from it. 184 AllocaPartitioning(const DataLayout &TD, AllocaInst &AI); 185 186 /// \brief Test whether a pointer to the allocation escapes our analysis. 187 /// 188 /// If this is true, the partitioning is never fully built and should be 189 /// ignored. 190 bool isEscaped() const { return PointerEscapingInstr; } 191 192 /// \brief Support for iterating over the partitions. 193 /// @{ 194 typedef SmallVectorImpl<Partition>::iterator iterator; 195 iterator begin() { return Partitions.begin(); } 196 iterator end() { return Partitions.end(); } 197 198 typedef SmallVectorImpl<Partition>::const_iterator const_iterator; 199 const_iterator begin() const { return Partitions.begin(); } 200 const_iterator end() const { return Partitions.end(); } 201 /// @} 202 203 /// \brief Support for iterating over and manipulating a particular 204 /// partition's uses. 205 /// 206 /// The iteration support provided for uses is more limited, but also 207 /// includes some manipulation routines to support rewriting the uses of 208 /// partitions during SROA. 209 /// @{ 210 typedef SmallVectorImpl<PartitionUse>::iterator use_iterator; 211 use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); } 212 use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); } 213 use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); } 214 use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); } 215 216 typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator; 217 const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); } 218 const_use_iterator use_begin(const_iterator I) const { 219 return Uses[I - begin()].begin(); 220 } 221 const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); } 222 const_use_iterator use_end(const_iterator I) const { 223 return Uses[I - begin()].end(); 224 } 225 226 unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); } 227 unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); } 228 const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const { 229 return Uses[PIdx][UIdx]; 230 } 231 const PartitionUse &getUse(const_iterator I, unsigned UIdx) const { 232 return Uses[I - begin()][UIdx]; 233 } 234 235 void use_push_back(unsigned Idx, const PartitionUse &PU) { 236 Uses[Idx].push_back(PU); 237 } 238 void use_push_back(const_iterator I, const PartitionUse &PU) { 239 Uses[I - begin()].push_back(PU); 240 } 241 /// @} 242 243 /// \brief Allow iterating the dead users for this alloca. 244 /// 245 /// These are instructions which will never actually use the alloca as they 246 /// are outside the allocated range. They are safe to replace with undef and 247 /// delete. 248 /// @{ 249 typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator; 250 dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); } 251 dead_user_iterator dead_user_end() const { return DeadUsers.end(); } 252 /// @} 253 254 /// \brief Allow iterating the dead expressions referring to this alloca. 255 /// 256 /// These are operands which have cannot actually be used to refer to the 257 /// alloca as they are outside its range and the user doesn't correct for 258 /// that. These mostly consist of PHI node inputs and the like which we just 259 /// need to replace with undef. 260 /// @{ 261 typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator; 262 dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); } 263 dead_op_iterator dead_op_end() const { return DeadOperands.end(); } 264 /// @} 265 266 /// \brief MemTransferInst auxiliary data. 267 /// This struct provides some auxiliary data about memory transfer 268 /// intrinsics such as memcpy and memmove. These intrinsics can use two 269 /// different ranges within the same alloca, and provide other challenges to 270 /// correctly represent. We stash extra data to help us untangle this 271 /// after the partitioning is complete. 272 struct MemTransferOffsets { 273 /// The destination begin and end offsets when the destination is within 274 /// this alloca. If the end offset is zero the destination is not within 275 /// this alloca. 276 uint64_t DestBegin, DestEnd; 277 278 /// The source begin and end offsets when the source is within this alloca. 279 /// If the end offset is zero, the source is not within this alloca. 280 uint64_t SourceBegin, SourceEnd; 281 282 /// Flag for whether an alloca is splittable. 283 bool IsSplittable; 284 }; 285 MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const { 286 return MemTransferInstData.lookup(&II); 287 } 288 289 /// \brief Map from a PHI or select operand back to a partition. 290 /// 291 /// When manipulating PHI nodes or selects, they can use more than one 292 /// partition of an alloca. We store a special mapping to allow finding the 293 /// partition referenced by each of these operands, if any. 294 iterator findPartitionForPHIOrSelectOperand(Use *U) { 295 SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt 296 = PHIOrSelectOpMap.find(U); 297 if (MapIt == PHIOrSelectOpMap.end()) 298 return end(); 299 300 return begin() + MapIt->second.first; 301 } 302 303 /// \brief Map from a PHI or select operand back to the specific use of 304 /// a partition. 305 /// 306 /// Similar to mapping these operands back to the partitions, this maps 307 /// directly to the use structure of that partition. 308 use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) { 309 SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt 310 = PHIOrSelectOpMap.find(U); 311 assert(MapIt != PHIOrSelectOpMap.end()); 312 return Uses[MapIt->second.first].begin() + MapIt->second.second; 313 } 314 315 /// \brief Compute a common type among the uses of a particular partition. 316 /// 317 /// This routines walks all of the uses of a particular partition and tries 318 /// to find a common type between them. Untyped operations such as memset and 319 /// memcpy are ignored. 320 Type *getCommonType(iterator I) const; 321 322#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 323 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; 324 void printUsers(raw_ostream &OS, const_iterator I, 325 StringRef Indent = " ") const; 326 void print(raw_ostream &OS) const; 327 void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const; 328 void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const; 329#endif 330 331private: 332 template <typename DerivedT, typename RetT = void> class BuilderBase; 333 class PartitionBuilder; 334 friend class AllocaPartitioning::PartitionBuilder; 335 class UseBuilder; 336 friend class AllocaPartitioning::UseBuilder; 337 338#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 339 /// \brief Handle to alloca instruction to simplify method interfaces. 340 AllocaInst &AI; 341#endif 342 343 /// \brief The instruction responsible for this alloca having no partitioning. 344 /// 345 /// When an instruction (potentially) escapes the pointer to the alloca, we 346 /// store a pointer to that here and abort trying to partition the alloca. 347 /// This will be null if the alloca is partitioned successfully. 348 Instruction *PointerEscapingInstr; 349 350 /// \brief The partitions of the alloca. 351 /// 352 /// We store a vector of the partitions over the alloca here. This vector is 353 /// sorted by increasing begin offset, and then by decreasing end offset. See 354 /// the Partition inner class for more details. Initially (during 355 /// construction) there are overlaps, but we form a disjoint sequence of 356 /// partitions while finishing construction and a fully constructed object is 357 /// expected to always have this as a disjoint space. 358 SmallVector<Partition, 8> Partitions; 359 360 /// \brief The uses of the partitions. 361 /// 362 /// This is essentially a mapping from each partition to a list of uses of 363 /// that partition. The mapping is done with a Uses vector that has the exact 364 /// same number of entries as the partition vector. Each entry is itself 365 /// a vector of the uses. 366 SmallVector<SmallVector<PartitionUse, 2>, 8> Uses; 367 368 /// \brief Instructions which will become dead if we rewrite the alloca. 369 /// 370 /// Note that these are not separated by partition. This is because we expect 371 /// a partitioned alloca to be completely rewritten or not rewritten at all. 372 /// If rewritten, all these instructions can simply be removed and replaced 373 /// with undef as they come from outside of the allocated space. 374 SmallVector<Instruction *, 8> DeadUsers; 375 376 /// \brief Operands which will become dead if we rewrite the alloca. 377 /// 378 /// These are operands that in their particular use can be replaced with 379 /// undef when we rewrite the alloca. These show up in out-of-bounds inputs 380 /// to PHI nodes and the like. They aren't entirely dead (there might be 381 /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we 382 /// want to swap this particular input for undef to simplify the use lists of 383 /// the alloca. 384 SmallVector<Use *, 8> DeadOperands; 385 386 /// \brief The underlying storage for auxiliary memcpy and memset info. 387 SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData; 388 389 /// \brief A side datastructure used when building up the partitions and uses. 390 /// 391 /// This mapping is only really used during the initial building of the 392 /// partitioning so that we can retain information about PHI and select nodes 393 /// processed. 394 SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes; 395 396 /// \brief Auxiliary information for particular PHI or select operands. 397 SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap; 398 399 /// \brief A utility routine called from the constructor. 400 /// 401 /// This does what it says on the tin. It is the key of the alloca partition 402 /// splitting and merging. After it is called we have the desired disjoint 403 /// collection of partitions. 404 void splitAndMergePartitions(); 405}; 406} 407 408static Value *foldSelectInst(SelectInst &SI) { 409 // If the condition being selected on is a constant or the same value is 410 // being selected between, fold the select. Yes this does (rarely) happen 411 // early on. 412 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) 413 return SI.getOperand(1+CI->isZero()); 414 if (SI.getOperand(1) == SI.getOperand(2)) { 415 return SI.getOperand(1); 416 } 417 return 0; 418} 419 420/// \brief Builder for the alloca partitioning. 421/// 422/// This class builds an alloca partitioning by recursively visiting the uses 423/// of an alloca and splitting the partitions for each load and store at each 424/// offset. 425class AllocaPartitioning::PartitionBuilder 426 : public PtrUseVisitor<PartitionBuilder> { 427 friend class PtrUseVisitor<PartitionBuilder>; 428 friend class InstVisitor<PartitionBuilder>; 429 typedef PtrUseVisitor<PartitionBuilder> Base; 430 431 const uint64_t AllocSize; 432 AllocaPartitioning &P; 433 434 SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap; 435 436public: 437 PartitionBuilder(const DataLayout &DL, AllocaInst &AI, AllocaPartitioning &P) 438 : PtrUseVisitor<PartitionBuilder>(DL), 439 AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), 440 P(P) {} 441 442private: 443 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size, 444 bool IsSplittable = false) { 445 // Completely skip uses which have a zero size or start either before or 446 // past the end of the allocation. 447 if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) { 448 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset 449 << " which has zero size or starts outside of the " 450 << AllocSize << " byte alloca:\n" 451 << " alloca: " << P.AI << "\n" 452 << " use: " << I << "\n"); 453 return; 454 } 455 456 uint64_t BeginOffset = Offset.getZExtValue(); 457 uint64_t EndOffset = BeginOffset + Size; 458 459 // Clamp the end offset to the end of the allocation. Note that this is 460 // formulated to handle even the case where "BeginOffset + Size" overflows. 461 // NOTE! This may appear superficially to be something we could ignore 462 // entirely, but that is not so! There may be PHI-node uses where some 463 // instructions are dead but not others. We can't completely ignore the 464 // PHI node, and so have to record at least the information here. 465 assert(AllocSize >= BeginOffset); // Established above. 466 if (Size > AllocSize - BeginOffset) { 467 DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset 468 << " to remain within the " << AllocSize << " byte alloca:\n" 469 << " alloca: " << P.AI << "\n" 470 << " use: " << I << "\n"); 471 EndOffset = AllocSize; 472 } 473 474 Partition New(BeginOffset, EndOffset, IsSplittable); 475 P.Partitions.push_back(New); 476 } 477 478 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, 479 bool IsVolatile) { 480 uint64_t Size = DL.getTypeStoreSize(Ty); 481 482 // If this memory access can be shown to *statically* extend outside the 483 // bounds of of the allocation, it's behavior is undefined, so simply 484 // ignore it. Note that this is more strict than the generic clamping 485 // behavior of insertUse. We also try to handle cases which might run the 486 // risk of overflow. 487 // FIXME: We should instead consider the pointer to have escaped if this 488 // function is being instrumented for addressing bugs or race conditions. 489 if (Offset.isNegative() || Size > AllocSize || 490 Offset.ugt(AllocSize - Size)) { 491 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte " 492 << (isa<LoadInst>(I) ? "load" : "store") << " @" << Offset 493 << " which extends past the end of the " << AllocSize 494 << " byte alloca:\n" 495 << " alloca: " << P.AI << "\n" 496 << " use: " << I << "\n"); 497 return; 498 } 499 500 // We allow splitting of loads and stores where the type is an integer type 501 // and which cover the entire alloca. Such integer loads and stores 502 // often require decomposition into fine grained loads and stores. 503 bool IsSplittable = false; 504 if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) 505 IsSplittable = !IsVolatile && ITy->getBitWidth() == AllocSize*8; 506 507 insertUse(I, Offset, Size, IsSplittable); 508 } 509 510 void visitLoadInst(LoadInst &LI) { 511 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) && 512 "All simple FCA loads should have been pre-split"); 513 514 if (!IsOffsetKnown) 515 return PI.setAborted(&LI); 516 517 return handleLoadOrStore(LI.getType(), LI, Offset, LI.isVolatile()); 518 } 519 520 void visitStoreInst(StoreInst &SI) { 521 Value *ValOp = SI.getValueOperand(); 522 if (ValOp == *U) 523 return PI.setEscapedAndAborted(&SI); 524 if (!IsOffsetKnown) 525 return PI.setAborted(&SI); 526 527 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && 528 "All simple FCA stores should have been pre-split"); 529 handleLoadOrStore(ValOp->getType(), SI, Offset, SI.isVolatile()); 530 } 531 532 533 void visitMemSetInst(MemSetInst &II) { 534 assert(II.getRawDest() == *U && "Pointer use is not the destination?"); 535 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 536 if ((Length && Length->getValue() == 0) || 537 (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) 538 // Zero-length mem transfer intrinsics can be ignored entirely. 539 return; 540 541 if (!IsOffsetKnown) 542 return PI.setAborted(&II); 543 544 insertUse(II, Offset, 545 Length ? Length->getLimitedValue() 546 : AllocSize - Offset.getLimitedValue(), 547 (bool)Length); 548 } 549 550 void visitMemTransferInst(MemTransferInst &II) { 551 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 552 if ((Length && Length->getValue() == 0) || 553 (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) 554 // Zero-length mem transfer intrinsics can be ignored entirely. 555 return; 556 557 if (!IsOffsetKnown) 558 return PI.setAborted(&II); 559 560 uint64_t RawOffset = Offset.getLimitedValue(); 561 uint64_t Size = Length ? Length->getLimitedValue() 562 : AllocSize - RawOffset; 563 564 MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; 565 566 // Only intrinsics with a constant length can be split. 567 Offsets.IsSplittable = Length; 568 569 if (*U == II.getRawDest()) { 570 Offsets.DestBegin = RawOffset; 571 Offsets.DestEnd = RawOffset + Size; 572 } 573 if (*U == II.getRawSource()) { 574 Offsets.SourceBegin = RawOffset; 575 Offsets.SourceEnd = RawOffset + Size; 576 } 577 578 // If we have set up end offsets for both the source and the destination, 579 // we have found both sides of this transfer pointing at the same alloca. 580 bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd; 581 if (SeenBothEnds && II.getRawDest() != II.getRawSource()) { 582 unsigned PrevIdx = MemTransferPartitionMap[&II]; 583 584 // Check if the begin offsets match and this is a non-volatile transfer. 585 // In that case, we can completely elide the transfer. 586 if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) { 587 P.Partitions[PrevIdx].kill(); 588 return; 589 } 590 591 // Otherwise we have an offset transfer within the same alloca. We can't 592 // split those. 593 P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false; 594 } else if (SeenBothEnds) { 595 // Handle the case where this exact use provides both ends of the 596 // operation. 597 assert(II.getRawDest() == II.getRawSource()); 598 599 // For non-volatile transfers this is a no-op. 600 if (!II.isVolatile()) 601 return; 602 603 // Otherwise just suppress splitting. 604 Offsets.IsSplittable = false; 605 } 606 607 608 // Insert the use now that we've fixed up the splittable nature. 609 insertUse(II, Offset, Size, Offsets.IsSplittable); 610 611 // Setup the mapping from intrinsic to partition of we've not seen both 612 // ends of this transfer. 613 if (!SeenBothEnds) { 614 unsigned NewIdx = P.Partitions.size() - 1; 615 bool Inserted 616 = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second; 617 assert(Inserted && 618 "Already have intrinsic in map but haven't seen both ends"); 619 (void)Inserted; 620 } 621 } 622 623 // Disable SRoA for any intrinsics except for lifetime invariants. 624 // FIXME: What about debug instrinsics? This matches old behavior, but 625 // doesn't make sense. 626 void visitIntrinsicInst(IntrinsicInst &II) { 627 if (!IsOffsetKnown) 628 return PI.setAborted(&II); 629 630 if (II.getIntrinsicID() == Intrinsic::lifetime_start || 631 II.getIntrinsicID() == Intrinsic::lifetime_end) { 632 ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); 633 uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(), 634 Length->getLimitedValue()); 635 insertUse(II, Offset, Size, true); 636 return; 637 } 638 639 Base::visitIntrinsicInst(II); 640 } 641 642 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { 643 // We consider any PHI or select that results in a direct load or store of 644 // the same offset to be a viable use for partitioning purposes. These uses 645 // are considered unsplittable and the size is the maximum loaded or stored 646 // size. 647 SmallPtrSet<Instruction *, 4> Visited; 648 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses; 649 Visited.insert(Root); 650 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root)); 651 // If there are no loads or stores, the access is dead. We mark that as 652 // a size zero access. 653 Size = 0; 654 do { 655 Instruction *I, *UsedI; 656 llvm::tie(UsedI, I) = Uses.pop_back_val(); 657 658 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 659 Size = std::max(Size, DL.getTypeStoreSize(LI->getType())); 660 continue; 661 } 662 if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 663 Value *Op = SI->getOperand(0); 664 if (Op == UsedI) 665 return SI; 666 Size = std::max(Size, DL.getTypeStoreSize(Op->getType())); 667 continue; 668 } 669 670 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { 671 if (!GEP->hasAllZeroIndices()) 672 return GEP; 673 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) && 674 !isa<SelectInst>(I)) { 675 return I; 676 } 677 678 for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; 679 ++UI) 680 if (Visited.insert(cast<Instruction>(*UI))) 681 Uses.push_back(std::make_pair(I, cast<Instruction>(*UI))); 682 } while (!Uses.empty()); 683 684 return 0; 685 } 686 687 void visitPHINode(PHINode &PN) { 688 if (PN.use_empty()) 689 return; 690 if (!IsOffsetKnown) 691 return PI.setAborted(&PN); 692 693 // See if we already have computed info on this node. 694 std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN]; 695 if (PHIInfo.first) { 696 PHIInfo.second = true; 697 insertUse(PN, Offset, PHIInfo.first); 698 return; 699 } 700 701 // Check for an unsafe use of the PHI node. 702 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first)) 703 return PI.setAborted(UnsafeI); 704 705 insertUse(PN, Offset, PHIInfo.first); 706 } 707 708 void visitSelectInst(SelectInst &SI) { 709 if (SI.use_empty()) 710 return; 711 if (Value *Result = foldSelectInst(SI)) { 712 if (Result == *U) 713 // If the result of the constant fold will be the pointer, recurse 714 // through the select as if we had RAUW'ed it. 715 enqueueUsers(SI); 716 717 return; 718 } 719 if (!IsOffsetKnown) 720 return PI.setAborted(&SI); 721 722 // See if we already have computed info on this node. 723 std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI]; 724 if (SelectInfo.first) { 725 SelectInfo.second = true; 726 insertUse(SI, Offset, SelectInfo.first); 727 return; 728 } 729 730 // Check for an unsafe use of the PHI node. 731 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first)) 732 return PI.setAborted(UnsafeI); 733 734 insertUse(SI, Offset, SelectInfo.first); 735 } 736 737 /// \brief Disable SROA entirely if there are unhandled users of the alloca. 738 void visitInstruction(Instruction &I) { 739 PI.setAborted(&I); 740 } 741}; 742 743/// \brief Use adder for the alloca partitioning. 744/// 745/// This class adds the uses of an alloca to all of the partitions which they 746/// use. For splittable partitions, this can end up doing essentially a linear 747/// walk of the partitions, but the number of steps remains bounded by the 748/// total result instruction size: 749/// - The number of partitions is a result of the number unsplittable 750/// instructions using the alloca. 751/// - The number of users of each partition is at worst the total number of 752/// splittable instructions using the alloca. 753/// Thus we will produce N * M instructions in the end, where N are the number 754/// of unsplittable uses and M are the number of splittable. This visitor does 755/// the exact same number of updates to the partitioning. 756/// 757/// In the more common case, this visitor will leverage the fact that the 758/// partition space is pre-sorted, and do a logarithmic search for the 759/// partition needed, making the total visit a classical ((N + M) * log(N)) 760/// complexity operation. 761class AllocaPartitioning::UseBuilder : public PtrUseVisitor<UseBuilder> { 762 friend class PtrUseVisitor<UseBuilder>; 763 friend class InstVisitor<UseBuilder>; 764 typedef PtrUseVisitor<UseBuilder> Base; 765 766 const uint64_t AllocSize; 767 AllocaPartitioning &P; 768 769 /// \brief Set to de-duplicate dead instructions found in the use walk. 770 SmallPtrSet<Instruction *, 4> VisitedDeadInsts; 771 772public: 773 UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) 774 : PtrUseVisitor<UseBuilder>(TD), 775 AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())), 776 P(P) {} 777 778private: 779 void markAsDead(Instruction &I) { 780 if (VisitedDeadInsts.insert(&I)) 781 P.DeadUsers.push_back(&I); 782 } 783 784 void insertUse(Instruction &User, const APInt &Offset, uint64_t Size) { 785 // If the use has a zero size or extends outside of the allocation, record 786 // it as a dead use for elimination later. 787 if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) 788 return markAsDead(User); 789 790 uint64_t BeginOffset = Offset.getZExtValue(); 791 uint64_t EndOffset = BeginOffset + Size; 792 793 // Clamp the end offset to the end of the allocation. Note that this is 794 // formulated to handle even the case where "BeginOffset + Size" overflows. 795 assert(AllocSize >= BeginOffset); // Established above. 796 if (Size > AllocSize - BeginOffset) 797 EndOffset = AllocSize; 798 799 // NB: This only works if we have zero overlapping partitions. 800 iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset); 801 if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset) 802 B = llvm::prior(B); 803 for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset; 804 ++I) { 805 PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset), 806 std::min(I->EndOffset, EndOffset), U); 807 P.use_push_back(I, NewPU); 808 if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser())) 809 P.PHIOrSelectOpMap[U] 810 = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1); 811 } 812 } 813 814 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset) { 815 uint64_t Size = DL.getTypeStoreSize(Ty); 816 817 // If this memory access can be shown to *statically* extend outside the 818 // bounds of of the allocation, it's behavior is undefined, so simply 819 // ignore it. Note that this is more strict than the generic clamping 820 // behavior of insertUse. 821 if (Offset.isNegative() || Size > AllocSize || 822 Offset.ugt(AllocSize - Size)) 823 return markAsDead(I); 824 825 insertUse(I, Offset, Size); 826 } 827 828 void visitBitCastInst(BitCastInst &BC) { 829 if (BC.use_empty()) 830 return markAsDead(BC); 831 832 return Base::visitBitCastInst(BC); 833 } 834 835 void visitGetElementPtrInst(GetElementPtrInst &GEPI) { 836 if (GEPI.use_empty()) 837 return markAsDead(GEPI); 838 839 return Base::visitGetElementPtrInst(GEPI); 840 } 841 842 void visitLoadInst(LoadInst &LI) { 843 assert(IsOffsetKnown); 844 handleLoadOrStore(LI.getType(), LI, Offset); 845 } 846 847 void visitStoreInst(StoreInst &SI) { 848 assert(IsOffsetKnown); 849 handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset); 850 } 851 852 void visitMemSetInst(MemSetInst &II) { 853 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 854 if ((Length && Length->getValue() == 0) || 855 (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) 856 return markAsDead(II); 857 858 assert(IsOffsetKnown); 859 insertUse(II, Offset, Length ? Length->getLimitedValue() 860 : AllocSize - Offset.getLimitedValue()); 861 } 862 863 void visitMemTransferInst(MemTransferInst &II) { 864 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 865 if ((Length && Length->getValue() == 0) || 866 (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) 867 return markAsDead(II); 868 869 assert(IsOffsetKnown); 870 uint64_t Size = Length ? Length->getLimitedValue() 871 : AllocSize - Offset.getLimitedValue(); 872 873 MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; 874 if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd && 875 Offsets.DestBegin == Offsets.SourceBegin) 876 return markAsDead(II); // Skip identity transfers without side-effects. 877 878 insertUse(II, Offset, Size); 879 } 880 881 void visitIntrinsicInst(IntrinsicInst &II) { 882 assert(IsOffsetKnown); 883 assert(II.getIntrinsicID() == Intrinsic::lifetime_start || 884 II.getIntrinsicID() == Intrinsic::lifetime_end); 885 886 ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); 887 insertUse(II, Offset, std::min(Length->getLimitedValue(), 888 AllocSize - Offset.getLimitedValue())); 889 } 890 891 void insertPHIOrSelect(Instruction &User, const APInt &Offset) { 892 uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first; 893 894 // For PHI and select operands outside the alloca, we can't nuke the entire 895 // phi or select -- the other side might still be relevant, so we special 896 // case them here and use a separate structure to track the operands 897 // themselves which should be replaced with undef. 898 if ((Offset.isNegative() && Offset.uge(Size)) || 899 (!Offset.isNegative() && Offset.uge(AllocSize))) { 900 P.DeadOperands.push_back(U); 901 return; 902 } 903 904 insertUse(User, Offset, Size); 905 } 906 907 void visitPHINode(PHINode &PN) { 908 if (PN.use_empty()) 909 return markAsDead(PN); 910 911 assert(IsOffsetKnown); 912 insertPHIOrSelect(PN, Offset); 913 } 914 915 void visitSelectInst(SelectInst &SI) { 916 if (SI.use_empty()) 917 return markAsDead(SI); 918 919 if (Value *Result = foldSelectInst(SI)) { 920 if (Result == *U) 921 // If the result of the constant fold will be the pointer, recurse 922 // through the select as if we had RAUW'ed it. 923 enqueueUsers(SI); 924 else 925 // Otherwise the operand to the select is dead, and we can replace it 926 // with undef. 927 P.DeadOperands.push_back(U); 928 929 return; 930 } 931 932 assert(IsOffsetKnown); 933 insertPHIOrSelect(SI, Offset); 934 } 935 936 /// \brief Unreachable, we've already visited the alloca once. 937 void visitInstruction(Instruction &I) { 938 llvm_unreachable("Unhandled instruction in use builder."); 939 } 940}; 941 942void AllocaPartitioning::splitAndMergePartitions() { 943 size_t NumDeadPartitions = 0; 944 945 // Track the range of splittable partitions that we pass when accumulating 946 // overlapping unsplittable partitions. 947 uint64_t SplitEndOffset = 0ull; 948 949 Partition New(0ull, 0ull, false); 950 951 for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) { 952 ++j; 953 954 if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) { 955 assert(New.BeginOffset == New.EndOffset); 956 New = Partitions[i]; 957 } else { 958 assert(New.IsSplittable); 959 New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset); 960 } 961 assert(New.BeginOffset != New.EndOffset); 962 963 // Scan the overlapping partitions. 964 while (j != e && New.EndOffset > Partitions[j].BeginOffset) { 965 // If the new partition we are forming is splittable, stop at the first 966 // unsplittable partition. 967 if (New.IsSplittable && !Partitions[j].IsSplittable) 968 break; 969 970 // Grow the new partition to include any equally splittable range. 'j' is 971 // always equally splittable when New is splittable, but when New is not 972 // splittable, we may subsume some (or part of some) splitable partition 973 // without growing the new one. 974 if (New.IsSplittable == Partitions[j].IsSplittable) { 975 New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset); 976 } else { 977 assert(!New.IsSplittable); 978 assert(Partitions[j].IsSplittable); 979 SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset); 980 } 981 982 Partitions[j].kill(); 983 ++NumDeadPartitions; 984 ++j; 985 } 986 987 // If the new partition is splittable, chop off the end as soon as the 988 // unsplittable subsequent partition starts and ensure we eventually cover 989 // the splittable area. 990 if (j != e && New.IsSplittable) { 991 SplitEndOffset = std::max(SplitEndOffset, New.EndOffset); 992 New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); 993 } 994 995 // Add the new partition if it differs from the original one and is 996 // non-empty. We can end up with an empty partition here if it was 997 // splittable but there is an unsplittable one that starts at the same 998 // offset. 999 if (New != Partitions[i]) { 1000 if (New.BeginOffset != New.EndOffset) 1001 Partitions.push_back(New); 1002 // Mark the old one for removal. 1003 Partitions[i].kill(); 1004 ++NumDeadPartitions; 1005 } 1006 1007 New.BeginOffset = New.EndOffset; 1008 if (!New.IsSplittable) { 1009 New.EndOffset = std::max(New.EndOffset, SplitEndOffset); 1010 if (j != e && !Partitions[j].IsSplittable) 1011 New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); 1012 New.IsSplittable = true; 1013 // If there is a trailing splittable partition which won't be fused into 1014 // the next splittable partition go ahead and add it onto the partitions 1015 // list. 1016 if (New.BeginOffset < New.EndOffset && 1017 (j == e || !Partitions[j].IsSplittable || 1018 New.EndOffset < Partitions[j].BeginOffset)) { 1019 Partitions.push_back(New); 1020 New.BeginOffset = New.EndOffset = 0ull; 1021 } 1022 } 1023 } 1024 1025 // Re-sort the partitions now that they have been split and merged into 1026 // disjoint set of partitions. Also remove any of the dead partitions we've 1027 // replaced in the process. 1028 std::sort(Partitions.begin(), Partitions.end()); 1029 if (NumDeadPartitions) { 1030 assert(Partitions.back().isDead()); 1031 assert((ptrdiff_t)NumDeadPartitions == 1032 std::count(Partitions.begin(), Partitions.end(), Partitions.back())); 1033 } 1034 Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end()); 1035} 1036 1037AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI) 1038 : 1039#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1040 AI(AI), 1041#endif 1042 PointerEscapingInstr(0) { 1043 PartitionBuilder PB(TD, AI, *this); 1044 PartitionBuilder::PtrInfo PtrI = PB.visitPtr(AI); 1045 if (PtrI.isEscaped() || PtrI.isAborted()) { 1046 // FIXME: We should sink the escape vs. abort info into the caller nicely, 1047 // possibly by just storing the PtrInfo in the AllocaPartitioning. 1048 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst() 1049 : PtrI.getAbortingInst(); 1050 assert(PointerEscapingInstr && "Did not track a bad instruction"); 1051 return; 1052 } 1053 1054 // Sort the uses. This arranges for the offsets to be in ascending order, 1055 // and the sizes to be in descending order. 1056 std::sort(Partitions.begin(), Partitions.end()); 1057 1058 // Remove any partitions from the back which are marked as dead. 1059 while (!Partitions.empty() && Partitions.back().isDead()) 1060 Partitions.pop_back(); 1061 1062 if (Partitions.size() > 1) { 1063 // Intersect splittability for all partitions with equal offsets and sizes. 1064 // Then remove all but the first so that we have a sequence of non-equal but 1065 // potentially overlapping partitions. 1066 for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E; 1067 I = J) { 1068 ++J; 1069 while (J != E && *I == *J) { 1070 I->IsSplittable &= J->IsSplittable; 1071 ++J; 1072 } 1073 } 1074 Partitions.erase(std::unique(Partitions.begin(), Partitions.end()), 1075 Partitions.end()); 1076 1077 // Split splittable and merge unsplittable partitions into a disjoint set 1078 // of partitions over the used space of the allocation. 1079 splitAndMergePartitions(); 1080 } 1081 1082 // Now build up the user lists for each of these disjoint partitions by 1083 // re-walking the recursive users of the alloca. 1084 Uses.resize(Partitions.size()); 1085 UseBuilder UB(TD, AI, *this); 1086 PtrI = UB.visitPtr(AI); 1087 assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!"); 1088 assert(!PtrI.isAborted() && "Early aborted the visit of the pointer."); 1089} 1090 1091Type *AllocaPartitioning::getCommonType(iterator I) const { 1092 Type *Ty = 0; 1093 for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { 1094 if (!UI->U) 1095 continue; // Skip dead uses. 1096 if (isa<IntrinsicInst>(*UI->U->getUser())) 1097 continue; 1098 if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset) 1099 continue; 1100 1101 Type *UserTy = 0; 1102 if (LoadInst *LI = dyn_cast<LoadInst>(UI->U->getUser())) { 1103 UserTy = LI->getType(); 1104 } else if (StoreInst *SI = dyn_cast<StoreInst>(UI->U->getUser())) { 1105 UserTy = SI->getValueOperand()->getType(); 1106 } else { 1107 return 0; // Bail if we have weird uses. 1108 } 1109 1110 if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { 1111 // If the type is larger than the partition, skip it. We only encounter 1112 // this for split integer operations where we want to use the type of the 1113 // entity causing the split. 1114 if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8) 1115 continue; 1116 1117 // If we have found an integer type use covering the alloca, use that 1118 // regardless of the other types, as integers are often used for a "bucket 1119 // of bits" type. 1120 return ITy; 1121 } 1122 1123 if (Ty && Ty != UserTy) 1124 return 0; 1125 1126 Ty = UserTy; 1127 } 1128 return Ty; 1129} 1130 1131#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1132 1133void AllocaPartitioning::print(raw_ostream &OS, const_iterator I, 1134 StringRef Indent) const { 1135 OS << Indent << "partition #" << (I - begin()) 1136 << " [" << I->BeginOffset << "," << I->EndOffset << ")" 1137 << (I->IsSplittable ? " (splittable)" : "") 1138 << (Uses[I - begin()].empty() ? " (zero uses)" : "") 1139 << "\n"; 1140} 1141 1142void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, 1143 StringRef Indent) const { 1144 for (const_use_iterator UI = use_begin(I), UE = use_end(I); 1145 UI != UE; ++UI) { 1146 if (!UI->U) 1147 continue; // Skip dead uses. 1148 OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " 1149 << "used by: " << *UI->U->getUser() << "\n"; 1150 if (MemTransferInst *II = dyn_cast<MemTransferInst>(UI->U->getUser())) { 1151 const MemTransferOffsets &MTO = MemTransferInstData.lookup(II); 1152 bool IsDest; 1153 if (!MTO.IsSplittable) 1154 IsDest = UI->BeginOffset == MTO.DestBegin; 1155 else 1156 IsDest = MTO.DestBegin != 0u; 1157 OS << Indent << " (original " << (IsDest ? "dest" : "source") << ": " 1158 << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin) 1159 << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n"; 1160 } 1161 } 1162} 1163 1164void AllocaPartitioning::print(raw_ostream &OS) const { 1165 if (PointerEscapingInstr) { 1166 OS << "No partitioning for alloca: " << AI << "\n" 1167 << " A pointer to this alloca escaped by:\n" 1168 << " " << *PointerEscapingInstr << "\n"; 1169 return; 1170 } 1171 1172 OS << "Partitioning of alloca: " << AI << "\n"; 1173 unsigned Num = 0; 1174 for (const_iterator I = begin(), E = end(); I != E; ++I, ++Num) { 1175 print(OS, I); 1176 printUsers(OS, I); 1177 } 1178} 1179 1180void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); } 1181void AllocaPartitioning::dump() const { print(dbgs()); } 1182 1183#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1184 1185 1186namespace { 1187/// \brief Implementation of LoadAndStorePromoter for promoting allocas. 1188/// 1189/// This subclass of LoadAndStorePromoter adds overrides to handle promoting 1190/// the loads and stores of an alloca instruction, as well as updating its 1191/// debug information. This is used when a domtree is unavailable and thus 1192/// mem2reg in its full form can't be used to handle promotion of allocas to 1193/// scalar values. 1194class AllocaPromoter : public LoadAndStorePromoter { 1195 AllocaInst &AI; 1196 DIBuilder &DIB; 1197 1198 SmallVector<DbgDeclareInst *, 4> DDIs; 1199 SmallVector<DbgValueInst *, 4> DVIs; 1200 1201public: 1202 AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, 1203 AllocaInst &AI, DIBuilder &DIB) 1204 : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} 1205 1206 void run(const SmallVectorImpl<Instruction*> &Insts) { 1207 // Remember which alloca we're promoting (for isInstInList). 1208 if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { 1209 for (Value::use_iterator UI = DebugNode->use_begin(), 1210 UE = DebugNode->use_end(); 1211 UI != UE; ++UI) 1212 if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) 1213 DDIs.push_back(DDI); 1214 else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) 1215 DVIs.push_back(DVI); 1216 } 1217 1218 LoadAndStorePromoter::run(Insts); 1219 AI.eraseFromParent(); 1220 while (!DDIs.empty()) 1221 DDIs.pop_back_val()->eraseFromParent(); 1222 while (!DVIs.empty()) 1223 DVIs.pop_back_val()->eraseFromParent(); 1224 } 1225 1226 virtual bool isInstInList(Instruction *I, 1227 const SmallVectorImpl<Instruction*> &Insts) const { 1228 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 1229 return LI->getOperand(0) == &AI; 1230 return cast<StoreInst>(I)->getPointerOperand() == &AI; 1231 } 1232 1233 virtual void updateDebugInfo(Instruction *Inst) const { 1234 for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), 1235 E = DDIs.end(); I != E; ++I) { 1236 DbgDeclareInst *DDI = *I; 1237 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) 1238 ConvertDebugDeclareToDebugValue(DDI, SI, DIB); 1239 else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) 1240 ConvertDebugDeclareToDebugValue(DDI, LI, DIB); 1241 } 1242 for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), 1243 E = DVIs.end(); I != E; ++I) { 1244 DbgValueInst *DVI = *I; 1245 Value *Arg = NULL; 1246 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { 1247 // If an argument is zero extended then use argument directly. The ZExt 1248 // may be zapped by an optimization pass in future. 1249 if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) 1250 Arg = dyn_cast<Argument>(ZExt->getOperand(0)); 1251 if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) 1252 Arg = dyn_cast<Argument>(SExt->getOperand(0)); 1253 if (!Arg) 1254 Arg = SI->getOperand(0); 1255 } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { 1256 Arg = LI->getOperand(0); 1257 } else { 1258 continue; 1259 } 1260 Instruction *DbgVal = 1261 DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), 1262 Inst); 1263 DbgVal->setDebugLoc(DVI->getDebugLoc()); 1264 } 1265 } 1266}; 1267} // end anon namespace 1268 1269 1270namespace { 1271/// \brief An optimization pass providing Scalar Replacement of Aggregates. 1272/// 1273/// This pass takes allocations which can be completely analyzed (that is, they 1274/// don't escape) and tries to turn them into scalar SSA values. There are 1275/// a few steps to this process. 1276/// 1277/// 1) It takes allocations of aggregates and analyzes the ways in which they 1278/// are used to try to split them into smaller allocations, ideally of 1279/// a single scalar data type. It will split up memcpy and memset accesses 1280/// as necessary and try to isolate invidual scalar accesses. 1281/// 2) It will transform accesses into forms which are suitable for SSA value 1282/// promotion. This can be replacing a memset with a scalar store of an 1283/// integer value, or it can involve speculating operations on a PHI or 1284/// select to be a PHI or select of the results. 1285/// 3) Finally, this will try to detect a pattern of accesses which map cleanly 1286/// onto insert and extract operations on a vector value, and convert them to 1287/// this form. By doing so, it will enable promotion of vector aggregates to 1288/// SSA vector values. 1289class SROA : public FunctionPass { 1290 const bool RequiresDomTree; 1291 1292 LLVMContext *C; 1293 const DataLayout *TD; 1294 DominatorTree *DT; 1295 1296 /// \brief Worklist of alloca instructions to simplify. 1297 /// 1298 /// Each alloca in the function is added to this. Each new alloca formed gets 1299 /// added to it as well to recursively simplify unless that alloca can be 1300 /// directly promoted. Finally, each time we rewrite a use of an alloca other 1301 /// the one being actively rewritten, we add it back onto the list if not 1302 /// already present to ensure it is re-visited. 1303 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist; 1304 1305 /// \brief A collection of instructions to delete. 1306 /// We try to batch deletions to simplify code and make things a bit more 1307 /// efficient. 1308 SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts; 1309 1310 /// \brief Post-promotion worklist. 1311 /// 1312 /// Sometimes we discover an alloca which has a high probability of becoming 1313 /// viable for SROA after a round of promotion takes place. In those cases, 1314 /// the alloca is enqueued here for re-processing. 1315 /// 1316 /// Note that we have to be very careful to clear allocas out of this list in 1317 /// the event they are deleted. 1318 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist; 1319 1320 /// \brief A collection of alloca instructions we can directly promote. 1321 std::vector<AllocaInst *> PromotableAllocas; 1322 1323public: 1324 SROA(bool RequiresDomTree = true) 1325 : FunctionPass(ID), RequiresDomTree(RequiresDomTree), 1326 C(0), TD(0), DT(0) { 1327 initializeSROAPass(*PassRegistry::getPassRegistry()); 1328 } 1329 bool runOnFunction(Function &F); 1330 void getAnalysisUsage(AnalysisUsage &AU) const; 1331 1332 const char *getPassName() const { return "SROA"; } 1333 static char ID; 1334 1335private: 1336 friend class PHIOrSelectSpeculator; 1337 friend class AllocaPartitionRewriter; 1338 friend class AllocaPartitionVectorRewriter; 1339 1340 bool rewriteAllocaPartition(AllocaInst &AI, 1341 AllocaPartitioning &P, 1342 AllocaPartitioning::iterator PI); 1343 bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P); 1344 bool runOnAlloca(AllocaInst &AI); 1345 void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); 1346 bool promoteAllocas(Function &F); 1347}; 1348} 1349 1350char SROA::ID = 0; 1351 1352FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { 1353 return new SROA(RequiresDomTree); 1354} 1355 1356INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", 1357 false, false) 1358INITIALIZE_PASS_DEPENDENCY(DominatorTree) 1359INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", 1360 false, false) 1361 1362namespace { 1363/// \brief Visitor to speculate PHIs and Selects where possible. 1364class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> { 1365 // Befriend the base class so it can delegate to private visit methods. 1366 friend class llvm::InstVisitor<PHIOrSelectSpeculator>; 1367 1368 const DataLayout &TD; 1369 AllocaPartitioning &P; 1370 SROA &Pass; 1371 1372public: 1373 PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass) 1374 : TD(TD), P(P), Pass(Pass) {} 1375 1376 /// \brief Visit the users of an alloca partition and rewrite them. 1377 void visitUsers(AllocaPartitioning::const_iterator PI) { 1378 // Note that we need to use an index here as the underlying vector of uses 1379 // may be grown during speculation. However, we never need to re-visit the 1380 // new uses, and so we can use the initial size bound. 1381 for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) { 1382 const AllocaPartitioning::PartitionUse &PU = P.getUse(PI, Idx); 1383 if (!PU.U) 1384 continue; // Skip dead use. 1385 1386 visit(cast<Instruction>(PU.U->getUser())); 1387 } 1388 } 1389 1390private: 1391 // By default, skip this instruction. 1392 void visitInstruction(Instruction &I) {} 1393 1394 /// PHI instructions that use an alloca and are subsequently loaded can be 1395 /// rewritten to load both input pointers in the pred blocks and then PHI the 1396 /// results, allowing the load of the alloca to be promoted. 1397 /// From this: 1398 /// %P2 = phi [i32* %Alloca, i32* %Other] 1399 /// %V = load i32* %P2 1400 /// to: 1401 /// %V1 = load i32* %Alloca -> will be mem2reg'd 1402 /// ... 1403 /// %V2 = load i32* %Other 1404 /// ... 1405 /// %V = phi [i32 %V1, i32 %V2] 1406 /// 1407 /// We can do this to a select if its only uses are loads and if the operands 1408 /// to the select can be loaded unconditionally. 1409 /// 1410 /// FIXME: This should be hoisted into a generic utility, likely in 1411 /// Transforms/Util/Local.h 1412 bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) { 1413 // For now, we can only do this promotion if the load is in the same block 1414 // as the PHI, and if there are no stores between the phi and load. 1415 // TODO: Allow recursive phi users. 1416 // TODO: Allow stores. 1417 BasicBlock *BB = PN.getParent(); 1418 unsigned MaxAlign = 0; 1419 for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); 1420 UI != UE; ++UI) { 1421 LoadInst *LI = dyn_cast<LoadInst>(*UI); 1422 if (LI == 0 || !LI->isSimple()) return false; 1423 1424 // For now we only allow loads in the same block as the PHI. This is 1425 // a common case that happens when instcombine merges two loads through 1426 // a PHI. 1427 if (LI->getParent() != BB) return false; 1428 1429 // Ensure that there are no instructions between the PHI and the load that 1430 // could store. 1431 for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) 1432 if (BBI->mayWriteToMemory()) 1433 return false; 1434 1435 MaxAlign = std::max(MaxAlign, LI->getAlignment()); 1436 Loads.push_back(LI); 1437 } 1438 1439 // We can only transform this if it is safe to push the loads into the 1440 // predecessor blocks. The only thing to watch out for is that we can't put 1441 // a possibly trapping load in the predecessor if it is a critical edge. 1442 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; 1443 ++Idx) { 1444 TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); 1445 Value *InVal = PN.getIncomingValue(Idx); 1446 1447 // If the value is produced by the terminator of the predecessor (an 1448 // invoke) or it has side-effects, there is no valid place to put a load 1449 // in the predecessor. 1450 if (TI == InVal || TI->mayHaveSideEffects()) 1451 return false; 1452 1453 // If the predecessor has a single successor, then the edge isn't 1454 // critical. 1455 if (TI->getNumSuccessors() == 1) 1456 continue; 1457 1458 // If this pointer is always safe to load, or if we can prove that there 1459 // is already a load in the block, then we can move the load to the pred 1460 // block. 1461 if (InVal->isDereferenceablePointer() || 1462 isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD)) 1463 continue; 1464 1465 return false; 1466 } 1467 1468 return true; 1469 } 1470 1471 void visitPHINode(PHINode &PN) { 1472 DEBUG(dbgs() << " original: " << PN << "\n"); 1473 1474 SmallVector<LoadInst *, 4> Loads; 1475 if (!isSafePHIToSpeculate(PN, Loads)) 1476 return; 1477 1478 assert(!Loads.empty()); 1479 1480 Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); 1481 IRBuilder<> PHIBuilder(&PN); 1482 PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), 1483 PN.getName() + ".sroa.speculated"); 1484 1485 // Get the TBAA tag and alignment to use from one of the loads. It doesn't 1486 // matter which one we get and if any differ, it doesn't matter. 1487 LoadInst *SomeLoad = cast<LoadInst>(Loads.back()); 1488 MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); 1489 unsigned Align = SomeLoad->getAlignment(); 1490 1491 // Rewrite all loads of the PN to use the new PHI. 1492 do { 1493 LoadInst *LI = Loads.pop_back_val(); 1494 LI->replaceAllUsesWith(NewPN); 1495 Pass.DeadInsts.insert(LI); 1496 } while (!Loads.empty()); 1497 1498 // Inject loads into all of the pred blocks. 1499 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 1500 BasicBlock *Pred = PN.getIncomingBlock(Idx); 1501 TerminatorInst *TI = Pred->getTerminator(); 1502 Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx)); 1503 Value *InVal = PN.getIncomingValue(Idx); 1504 IRBuilder<> PredBuilder(TI); 1505 1506 LoadInst *Load 1507 = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." + 1508 Pred->getName())); 1509 ++NumLoadsSpeculated; 1510 Load->setAlignment(Align); 1511 if (TBAATag) 1512 Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); 1513 NewPN->addIncoming(Load, Pred); 1514 1515 Instruction *Ptr = dyn_cast<Instruction>(InVal); 1516 if (!Ptr) 1517 // No uses to rewrite. 1518 continue; 1519 1520 // Try to lookup and rewrite any partition uses corresponding to this phi 1521 // input. 1522 AllocaPartitioning::iterator PI 1523 = P.findPartitionForPHIOrSelectOperand(InUse); 1524 if (PI == P.end()) 1525 continue; 1526 1527 // Replace the Use in the PartitionUse for this operand with the Use 1528 // inside the load. 1529 AllocaPartitioning::use_iterator UI 1530 = P.findPartitionUseForPHIOrSelectOperand(InUse); 1531 assert(isa<PHINode>(*UI->U->getUser())); 1532 UI->U = &Load->getOperandUse(Load->getPointerOperandIndex()); 1533 } 1534 DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); 1535 } 1536 1537 /// Select instructions that use an alloca and are subsequently loaded can be 1538 /// rewritten to load both input pointers and then select between the result, 1539 /// allowing the load of the alloca to be promoted. 1540 /// From this: 1541 /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other 1542 /// %V = load i32* %P2 1543 /// to: 1544 /// %V1 = load i32* %Alloca -> will be mem2reg'd 1545 /// %V2 = load i32* %Other 1546 /// %V = select i1 %cond, i32 %V1, i32 %V2 1547 /// 1548 /// We can do this to a select if its only uses are loads and if the operand 1549 /// to the select can be loaded unconditionally. 1550 bool isSafeSelectToSpeculate(SelectInst &SI, 1551 SmallVectorImpl<LoadInst *> &Loads) { 1552 Value *TValue = SI.getTrueValue(); 1553 Value *FValue = SI.getFalseValue(); 1554 bool TDerefable = TValue->isDereferenceablePointer(); 1555 bool FDerefable = FValue->isDereferenceablePointer(); 1556 1557 for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); 1558 UI != UE; ++UI) { 1559 LoadInst *LI = dyn_cast<LoadInst>(*UI); 1560 if (LI == 0 || !LI->isSimple()) return false; 1561 1562 // Both operands to the select need to be dereferencable, either 1563 // absolutely (e.g. allocas) or at this point because we can see other 1564 // accesses to it. 1565 if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI, 1566 LI->getAlignment(), &TD)) 1567 return false; 1568 if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI, 1569 LI->getAlignment(), &TD)) 1570 return false; 1571 Loads.push_back(LI); 1572 } 1573 1574 return true; 1575 } 1576 1577 void visitSelectInst(SelectInst &SI) { 1578 DEBUG(dbgs() << " original: " << SI << "\n"); 1579 IRBuilder<> IRB(&SI); 1580 1581 // If the select isn't safe to speculate, just use simple logic to emit it. 1582 SmallVector<LoadInst *, 4> Loads; 1583 if (!isSafeSelectToSpeculate(SI, Loads)) 1584 return; 1585 1586 Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) }; 1587 AllocaPartitioning::iterator PIs[2]; 1588 AllocaPartitioning::PartitionUse PUs[2]; 1589 for (unsigned i = 0, e = 2; i != e; ++i) { 1590 PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]); 1591 if (PIs[i] != P.end()) { 1592 // If the pointer is within the partitioning, remove the select from 1593 // its uses. We'll add in the new loads below. 1594 AllocaPartitioning::use_iterator UI 1595 = P.findPartitionUseForPHIOrSelectOperand(Ops[i]); 1596 PUs[i] = *UI; 1597 // Clear out the use here so that the offsets into the use list remain 1598 // stable but this use is ignored when rewriting. 1599 UI->U = 0; 1600 } 1601 } 1602 1603 Value *TV = SI.getTrueValue(); 1604 Value *FV = SI.getFalseValue(); 1605 // Replace the loads of the select with a select of two loads. 1606 while (!Loads.empty()) { 1607 LoadInst *LI = Loads.pop_back_val(); 1608 1609 IRB.SetInsertPoint(LI); 1610 LoadInst *TL = 1611 IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); 1612 LoadInst *FL = 1613 IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); 1614 NumLoadsSpeculated += 2; 1615 1616 // Transfer alignment and TBAA info if present. 1617 TL->setAlignment(LI->getAlignment()); 1618 FL->setAlignment(LI->getAlignment()); 1619 if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { 1620 TL->setMetadata(LLVMContext::MD_tbaa, Tag); 1621 FL->setMetadata(LLVMContext::MD_tbaa, Tag); 1622 } 1623 1624 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, 1625 LI->getName() + ".sroa.speculated"); 1626 1627 LoadInst *Loads[2] = { TL, FL }; 1628 for (unsigned i = 0, e = 2; i != e; ++i) { 1629 if (PIs[i] != P.end()) { 1630 Use *LoadUse = &Loads[i]->getOperandUse(0); 1631 assert(PUs[i].U->get() == LoadUse->get()); 1632 PUs[i].U = LoadUse; 1633 P.use_push_back(PIs[i], PUs[i]); 1634 } 1635 } 1636 1637 DEBUG(dbgs() << " speculated to: " << *V << "\n"); 1638 LI->replaceAllUsesWith(V); 1639 Pass.DeadInsts.insert(LI); 1640 } 1641 } 1642}; 1643} 1644 1645/// \brief Build a GEP out of a base pointer and indices. 1646/// 1647/// This will return the BasePtr if that is valid, or build a new GEP 1648/// instruction using the IRBuilder if GEP-ing is needed. 1649static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr, 1650 SmallVectorImpl<Value *> &Indices, 1651 const Twine &Prefix) { 1652 if (Indices.empty()) 1653 return BasePtr; 1654 1655 // A single zero index is a no-op, so check for this and avoid building a GEP 1656 // in that case. 1657 if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) 1658 return BasePtr; 1659 1660 return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx"); 1661} 1662 1663/// \brief Get a natural GEP off of the BasePtr walking through Ty toward 1664/// TargetTy without changing the offset of the pointer. 1665/// 1666/// This routine assumes we've already established a properly offset GEP with 1667/// Indices, and arrived at the Ty type. The goal is to continue to GEP with 1668/// zero-indices down through type layers until we find one the same as 1669/// TargetTy. If we can't find one with the same type, we at least try to use 1670/// one with the same size. If none of that works, we just produce the GEP as 1671/// indicated by Indices to have the correct offset. 1672static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD, 1673 Value *BasePtr, Type *Ty, Type *TargetTy, 1674 SmallVectorImpl<Value *> &Indices, 1675 const Twine &Prefix) { 1676 if (Ty == TargetTy) 1677 return buildGEP(IRB, BasePtr, Indices, Prefix); 1678 1679 // See if we can descend into a struct and locate a field with the correct 1680 // type. 1681 unsigned NumLayers = 0; 1682 Type *ElementTy = Ty; 1683 do { 1684 if (ElementTy->isPointerTy()) 1685 break; 1686 if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) { 1687 ElementTy = SeqTy->getElementType(); 1688 // Note that we use the default address space as this index is over an 1689 // array or a vector, not a pointer. 1690 Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0))); 1691 } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { 1692 if (STy->element_begin() == STy->element_end()) 1693 break; // Nothing left to descend into. 1694 ElementTy = *STy->element_begin(); 1695 Indices.push_back(IRB.getInt32(0)); 1696 } else { 1697 break; 1698 } 1699 ++NumLayers; 1700 } while (ElementTy != TargetTy); 1701 if (ElementTy != TargetTy) 1702 Indices.erase(Indices.end() - NumLayers, Indices.end()); 1703 1704 return buildGEP(IRB, BasePtr, Indices, Prefix); 1705} 1706 1707/// \brief Recursively compute indices for a natural GEP. 1708/// 1709/// This is the recursive step for getNaturalGEPWithOffset that walks down the 1710/// element types adding appropriate indices for the GEP. 1711static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, 1712 Value *Ptr, Type *Ty, APInt &Offset, 1713 Type *TargetTy, 1714 SmallVectorImpl<Value *> &Indices, 1715 const Twine &Prefix) { 1716 if (Offset == 0) 1717 return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix); 1718 1719 // We can't recurse through pointer types. 1720 if (Ty->isPointerTy()) 1721 return 0; 1722 1723 // We try to analyze GEPs over vectors here, but note that these GEPs are 1724 // extremely poorly defined currently. The long-term goal is to remove GEPing 1725 // over a vector from the IR completely. 1726 if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { 1727 unsigned ElementSizeInBits = TD.getTypeSizeInBits(VecTy->getScalarType()); 1728 if (ElementSizeInBits % 8) 1729 return 0; // GEPs over non-multiple of 8 size vector elements are invalid. 1730 APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); 1731 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1732 if (NumSkippedElements.ugt(VecTy->getNumElements())) 1733 return 0; 1734 Offset -= NumSkippedElements * ElementSize; 1735 Indices.push_back(IRB.getInt(NumSkippedElements)); 1736 return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), 1737 Offset, TargetTy, Indices, Prefix); 1738 } 1739 1740 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 1741 Type *ElementTy = ArrTy->getElementType(); 1742 APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); 1743 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1744 if (NumSkippedElements.ugt(ArrTy->getNumElements())) 1745 return 0; 1746 1747 Offset -= NumSkippedElements * ElementSize; 1748 Indices.push_back(IRB.getInt(NumSkippedElements)); 1749 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1750 Indices, Prefix); 1751 } 1752 1753 StructType *STy = dyn_cast<StructType>(Ty); 1754 if (!STy) 1755 return 0; 1756 1757 const StructLayout *SL = TD.getStructLayout(STy); 1758 uint64_t StructOffset = Offset.getZExtValue(); 1759 if (StructOffset >= SL->getSizeInBytes()) 1760 return 0; 1761 unsigned Index = SL->getElementContainingOffset(StructOffset); 1762 Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); 1763 Type *ElementTy = STy->getElementType(Index); 1764 if (Offset.uge(TD.getTypeAllocSize(ElementTy))) 1765 return 0; // The offset points into alignment padding. 1766 1767 Indices.push_back(IRB.getInt32(Index)); 1768 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1769 Indices, Prefix); 1770} 1771 1772/// \brief Get a natural GEP from a base pointer to a particular offset and 1773/// resulting in a particular type. 1774/// 1775/// The goal is to produce a "natural" looking GEP that works with the existing 1776/// composite types to arrive at the appropriate offset and element type for 1777/// a pointer. TargetTy is the element type the returned GEP should point-to if 1778/// possible. We recurse by decreasing Offset, adding the appropriate index to 1779/// Indices, and setting Ty to the result subtype. 1780/// 1781/// If no natural GEP can be constructed, this function returns null. 1782static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, 1783 Value *Ptr, APInt Offset, Type *TargetTy, 1784 SmallVectorImpl<Value *> &Indices, 1785 const Twine &Prefix) { 1786 PointerType *Ty = cast<PointerType>(Ptr->getType()); 1787 1788 // Don't consider any GEPs through an i8* as natural unless the TargetTy is 1789 // an i8. 1790 if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8)) 1791 return 0; 1792 1793 Type *ElementTy = Ty->getElementType(); 1794 if (!ElementTy->isSized()) 1795 return 0; // We can't GEP through an unsized element. 1796 APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); 1797 if (ElementSize == 0) 1798 return 0; // Zero-length arrays can't help us build a natural GEP. 1799 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1800 1801 Offset -= NumSkippedElements * ElementSize; 1802 Indices.push_back(IRB.getInt(NumSkippedElements)); 1803 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1804 Indices, Prefix); 1805} 1806 1807/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the 1808/// resulting pointer has PointerTy. 1809/// 1810/// This tries very hard to compute a "natural" GEP which arrives at the offset 1811/// and produces the pointer type desired. Where it cannot, it will try to use 1812/// the natural GEP to arrive at the offset and bitcast to the type. Where that 1813/// fails, it will try to use an existing i8* and GEP to the byte offset and 1814/// bitcast to the type. 1815/// 1816/// The strategy for finding the more natural GEPs is to peel off layers of the 1817/// pointer, walking back through bit casts and GEPs, searching for a base 1818/// pointer from which we can compute a natural GEP with the desired 1819/// properities. The algorithm tries to fold as many constant indices into 1820/// a single GEP as possible, thus making each GEP more independent of the 1821/// surrounding code. 1822static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD, 1823 Value *Ptr, APInt Offset, Type *PointerTy, 1824 const Twine &Prefix) { 1825 // Even though we don't look through PHI nodes, we could be called on an 1826 // instruction in an unreachable block, which may be on a cycle. 1827 SmallPtrSet<Value *, 4> Visited; 1828 Visited.insert(Ptr); 1829 SmallVector<Value *, 4> Indices; 1830 1831 // We may end up computing an offset pointer that has the wrong type. If we 1832 // never are able to compute one directly that has the correct type, we'll 1833 // fall back to it, so keep it around here. 1834 Value *OffsetPtr = 0; 1835 1836 // Remember any i8 pointer we come across to re-use if we need to do a raw 1837 // byte offset. 1838 Value *Int8Ptr = 0; 1839 APInt Int8PtrOffset(Offset.getBitWidth(), 0); 1840 1841 Type *TargetTy = PointerTy->getPointerElementType(); 1842 1843 do { 1844 // First fold any existing GEPs into the offset. 1845 while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { 1846 APInt GEPOffset(Offset.getBitWidth(), 0); 1847 if (!GEP->accumulateConstantOffset(TD, GEPOffset)) 1848 break; 1849 Offset += GEPOffset; 1850 Ptr = GEP->getPointerOperand(); 1851 if (!Visited.insert(Ptr)) 1852 break; 1853 } 1854 1855 // See if we can perform a natural GEP here. 1856 Indices.clear(); 1857 if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy, 1858 Indices, Prefix)) { 1859 if (P->getType() == PointerTy) { 1860 // Zap any offset pointer that we ended up computing in previous rounds. 1861 if (OffsetPtr && OffsetPtr->use_empty()) 1862 if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) 1863 I->eraseFromParent(); 1864 return P; 1865 } 1866 if (!OffsetPtr) { 1867 OffsetPtr = P; 1868 } 1869 } 1870 1871 // Stash this pointer if we've found an i8*. 1872 if (Ptr->getType()->isIntegerTy(8)) { 1873 Int8Ptr = Ptr; 1874 Int8PtrOffset = Offset; 1875 } 1876 1877 // Peel off a layer of the pointer and update the offset appropriately. 1878 if (Operator::getOpcode(Ptr) == Instruction::BitCast) { 1879 Ptr = cast<Operator>(Ptr)->getOperand(0); 1880 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) { 1881 if (GA->mayBeOverridden()) 1882 break; 1883 Ptr = GA->getAliasee(); 1884 } else { 1885 break; 1886 } 1887 assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!"); 1888 } while (Visited.insert(Ptr)); 1889 1890 if (!OffsetPtr) { 1891 if (!Int8Ptr) { 1892 Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(), 1893 Prefix + ".raw_cast"); 1894 Int8PtrOffset = Offset; 1895 } 1896 1897 OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr : 1898 IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), 1899 Prefix + ".raw_idx"); 1900 } 1901 Ptr = OffsetPtr; 1902 1903 // On the off chance we were targeting i8*, guard the bitcast here. 1904 if (Ptr->getType() != PointerTy) 1905 Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast"); 1906 1907 return Ptr; 1908} 1909 1910/// \brief Test whether we can convert a value from the old to the new type. 1911/// 1912/// This predicate should be used to guard calls to convertValue in order to 1913/// ensure that we only try to convert viable values. The strategy is that we 1914/// will peel off single element struct and array wrappings to get to an 1915/// underlying value, and convert that value. 1916static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { 1917 if (OldTy == NewTy) 1918 return true; 1919 if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) 1920 return false; 1921 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) 1922 return false; 1923 1924 if (NewTy->isPointerTy() || OldTy->isPointerTy()) { 1925 if (NewTy->isPointerTy() && OldTy->isPointerTy()) 1926 return true; 1927 if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) 1928 return true; 1929 return false; 1930 } 1931 1932 return true; 1933} 1934 1935/// \brief Generic routine to convert an SSA value to a value of a different 1936/// type. 1937/// 1938/// This will try various different casting techniques, such as bitcasts, 1939/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test 1940/// two types for viability with this routine. 1941static Value *convertValue(const DataLayout &DL, IRBuilder<> &IRB, Value *V, 1942 Type *Ty) { 1943 assert(canConvertValue(DL, V->getType(), Ty) && 1944 "Value not convertable to type"); 1945 if (V->getType() == Ty) 1946 return V; 1947 if (V->getType()->isIntegerTy() && Ty->isPointerTy()) 1948 return IRB.CreateIntToPtr(V, Ty); 1949 if (V->getType()->isPointerTy() && Ty->isIntegerTy()) 1950 return IRB.CreatePtrToInt(V, Ty); 1951 1952 return IRB.CreateBitCast(V, Ty); 1953} 1954 1955/// \brief Test whether the given alloca partition can be promoted to a vector. 1956/// 1957/// This is a quick test to check whether we can rewrite a particular alloca 1958/// partition (and its newly formed alloca) into a vector alloca with only 1959/// whole-vector loads and stores such that it could be promoted to a vector 1960/// SSA value. We only can ensure this for a limited set of operations, and we 1961/// don't want to do the rewrites unless we are confident that the result will 1962/// be promotable, so we have an early test here. 1963static bool isVectorPromotionViable(const DataLayout &TD, 1964 Type *AllocaTy, 1965 AllocaPartitioning &P, 1966 uint64_t PartitionBeginOffset, 1967 uint64_t PartitionEndOffset, 1968 AllocaPartitioning::const_use_iterator I, 1969 AllocaPartitioning::const_use_iterator E) { 1970 VectorType *Ty = dyn_cast<VectorType>(AllocaTy); 1971 if (!Ty) 1972 return false; 1973 1974 uint64_t VecSize = TD.getTypeSizeInBits(Ty); 1975 uint64_t ElementSize = TD.getTypeSizeInBits(Ty->getScalarType()); 1976 1977 // While the definition of LLVM vectors is bitpacked, we don't support sizes 1978 // that aren't byte sized. 1979 if (ElementSize % 8) 1980 return false; 1981 assert((VecSize % 8) == 0 && "vector size not a multiple of element size?"); 1982 VecSize /= 8; 1983 ElementSize /= 8; 1984 1985 for (; I != E; ++I) { 1986 if (!I->U) 1987 continue; // Skip dead use. 1988 1989 uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset; 1990 uint64_t BeginIndex = BeginOffset / ElementSize; 1991 if (BeginIndex * ElementSize != BeginOffset || 1992 BeginIndex >= Ty->getNumElements()) 1993 return false; 1994 uint64_t EndOffset = I->EndOffset - PartitionBeginOffset; 1995 uint64_t EndIndex = EndOffset / ElementSize; 1996 if (EndIndex * ElementSize != EndOffset || 1997 EndIndex > Ty->getNumElements()) 1998 return false; 1999 2000 assert(EndIndex > BeginIndex && "Empty vector!"); 2001 uint64_t NumElements = EndIndex - BeginIndex; 2002 Type *PartitionTy 2003 = (NumElements == 1) ? Ty->getElementType() 2004 : VectorType::get(Ty->getElementType(), NumElements); 2005 2006 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) { 2007 if (MI->isVolatile()) 2008 return false; 2009 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) { 2010 const AllocaPartitioning::MemTransferOffsets &MTO 2011 = P.getMemTransferOffsets(*MTI); 2012 if (!MTO.IsSplittable) 2013 return false; 2014 } 2015 } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) { 2016 // Disable vector promotion when there are loads or stores of an FCA. 2017 return false; 2018 } else if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) { 2019 if (LI->isVolatile()) 2020 return false; 2021 if (!canConvertValue(TD, PartitionTy, LI->getType())) 2022 return false; 2023 } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) { 2024 if (SI->isVolatile()) 2025 return false; 2026 if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy)) 2027 return false; 2028 } else { 2029 return false; 2030 } 2031 } 2032 return true; 2033} 2034 2035/// \brief Test whether the given alloca partition's integer operations can be 2036/// widened to promotable ones. 2037/// 2038/// This is a quick test to check whether we can rewrite the integer loads and 2039/// stores to a particular alloca into wider loads and stores and be able to 2040/// promote the resulting alloca. 2041static bool isIntegerWideningViable(const DataLayout &TD, 2042 Type *AllocaTy, 2043 uint64_t AllocBeginOffset, 2044 AllocaPartitioning &P, 2045 AllocaPartitioning::const_use_iterator I, 2046 AllocaPartitioning::const_use_iterator E) { 2047 uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy); 2048 // Don't create integer types larger than the maximum bitwidth. 2049 if (SizeInBits > IntegerType::MAX_INT_BITS) 2050 return false; 2051 2052 // Don't try to handle allocas with bit-padding. 2053 if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy)) 2054 return false; 2055 2056 // We need to ensure that an integer type with the appropriate bitwidth can 2057 // be converted to the alloca type, whatever that is. We don't want to force 2058 // the alloca itself to have an integer type if there is a more suitable one. 2059 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits); 2060 if (!canConvertValue(TD, AllocaTy, IntTy) || 2061 !canConvertValue(TD, IntTy, AllocaTy)) 2062 return false; 2063 2064 uint64_t Size = TD.getTypeStoreSize(AllocaTy); 2065 2066 // Check the uses to ensure the uses are (likely) promoteable integer uses. 2067 // Also ensure that the alloca has a covering load or store. We don't want 2068 // to widen the integer operotains only to fail to promote due to some other 2069 // unsplittable entry (which we may make splittable later). 2070 bool WholeAllocaOp = false; 2071 for (; I != E; ++I) { 2072 if (!I->U) 2073 continue; // Skip dead use. 2074 2075 uint64_t RelBegin = I->BeginOffset - AllocBeginOffset; 2076 uint64_t RelEnd = I->EndOffset - AllocBeginOffset; 2077 2078 // We can't reasonably handle cases where the load or store extends past 2079 // the end of the aloca's type and into its padding. 2080 if (RelEnd > Size) 2081 return false; 2082 2083 if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) { 2084 if (LI->isVolatile()) 2085 return false; 2086 if (RelBegin == 0 && RelEnd == Size) 2087 WholeAllocaOp = true; 2088 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { 2089 if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy)) 2090 return false; 2091 continue; 2092 } 2093 // Non-integer loads need to be convertible from the alloca type so that 2094 // they are promotable. 2095 if (RelBegin != 0 || RelEnd != Size || 2096 !canConvertValue(TD, AllocaTy, LI->getType())) 2097 return false; 2098 } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) { 2099 Type *ValueTy = SI->getValueOperand()->getType(); 2100 if (SI->isVolatile()) 2101 return false; 2102 if (RelBegin == 0 && RelEnd == Size) 2103 WholeAllocaOp = true; 2104 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { 2105 if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy)) 2106 return false; 2107 continue; 2108 } 2109 // Non-integer stores need to be convertible to the alloca type so that 2110 // they are promotable. 2111 if (RelBegin != 0 || RelEnd != Size || 2112 !canConvertValue(TD, ValueTy, AllocaTy)) 2113 return false; 2114 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) { 2115 if (MI->isVolatile() || !isa<Constant>(MI->getLength())) 2116 return false; 2117 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) { 2118 const AllocaPartitioning::MemTransferOffsets &MTO 2119 = P.getMemTransferOffsets(*MTI); 2120 if (!MTO.IsSplittable) 2121 return false; 2122 } 2123 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->U->getUser())) { 2124 if (II->getIntrinsicID() != Intrinsic::lifetime_start && 2125 II->getIntrinsicID() != Intrinsic::lifetime_end) 2126 return false; 2127 } else { 2128 return false; 2129 } 2130 } 2131 return WholeAllocaOp; 2132} 2133 2134static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V, 2135 IntegerType *Ty, uint64_t Offset, 2136 const Twine &Name) { 2137 DEBUG(dbgs() << " start: " << *V << "\n"); 2138 IntegerType *IntTy = cast<IntegerType>(V->getType()); 2139 assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && 2140 "Element extends past full value"); 2141 uint64_t ShAmt = 8*Offset; 2142 if (DL.isBigEndian()) 2143 ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); 2144 if (ShAmt) { 2145 V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); 2146 DEBUG(dbgs() << " shifted: " << *V << "\n"); 2147 } 2148 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 2149 "Cannot extract to a larger integer!"); 2150 if (Ty != IntTy) { 2151 V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); 2152 DEBUG(dbgs() << " trunced: " << *V << "\n"); 2153 } 2154 return V; 2155} 2156 2157static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old, 2158 Value *V, uint64_t Offset, const Twine &Name) { 2159 IntegerType *IntTy = cast<IntegerType>(Old->getType()); 2160 IntegerType *Ty = cast<IntegerType>(V->getType()); 2161 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 2162 "Cannot insert a larger integer!"); 2163 DEBUG(dbgs() << " start: " << *V << "\n"); 2164 if (Ty != IntTy) { 2165 V = IRB.CreateZExt(V, IntTy, Name + ".ext"); 2166 DEBUG(dbgs() << " extended: " << *V << "\n"); 2167 } 2168 assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && 2169 "Element store outside of alloca store"); 2170 uint64_t ShAmt = 8*Offset; 2171 if (DL.isBigEndian()) 2172 ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); 2173 if (ShAmt) { 2174 V = IRB.CreateShl(V, ShAmt, Name + ".shift"); 2175 DEBUG(dbgs() << " shifted: " << *V << "\n"); 2176 } 2177 2178 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { 2179 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); 2180 Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); 2181 DEBUG(dbgs() << " masked: " << *Old << "\n"); 2182 V = IRB.CreateOr(Old, V, Name + ".insert"); 2183 DEBUG(dbgs() << " inserted: " << *V << "\n"); 2184 } 2185 return V; 2186} 2187 2188static Value *extractVector(IRBuilder<> &IRB, Value *V, 2189 unsigned BeginIndex, unsigned EndIndex, 2190 const Twine &Name) { 2191 VectorType *VecTy = cast<VectorType>(V->getType()); 2192 unsigned NumElements = EndIndex - BeginIndex; 2193 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 2194 2195 if (NumElements == VecTy->getNumElements()) 2196 return V; 2197 2198 if (NumElements == 1) { 2199 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex), 2200 Name + ".extract"); 2201 DEBUG(dbgs() << " extract: " << *V << "\n"); 2202 return V; 2203 } 2204 2205 SmallVector<Constant*, 8> Mask; 2206 Mask.reserve(NumElements); 2207 for (unsigned i = BeginIndex; i != EndIndex; ++i) 2208 Mask.push_back(IRB.getInt32(i)); 2209 V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), 2210 ConstantVector::get(Mask), 2211 Name + ".extract"); 2212 DEBUG(dbgs() << " shuffle: " << *V << "\n"); 2213 return V; 2214} 2215 2216static Value *insertVector(IRBuilder<> &IRB, Value *Old, Value *V, 2217 unsigned BeginIndex, const Twine &Name) { 2218 VectorType *VecTy = cast<VectorType>(Old->getType()); 2219 assert(VecTy && "Can only insert a vector into a vector"); 2220 2221 VectorType *Ty = dyn_cast<VectorType>(V->getType()); 2222 if (!Ty) { 2223 // Single element to insert. 2224 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex), 2225 Name + ".insert"); 2226 DEBUG(dbgs() << " insert: " << *V << "\n"); 2227 return V; 2228 } 2229 2230 assert(Ty->getNumElements() <= VecTy->getNumElements() && 2231 "Too many elements!"); 2232 if (Ty->getNumElements() == VecTy->getNumElements()) { 2233 assert(V->getType() == VecTy && "Vector type mismatch"); 2234 return V; 2235 } 2236 unsigned EndIndex = BeginIndex + Ty->getNumElements(); 2237 2238 // When inserting a smaller vector into the larger to store, we first 2239 // use a shuffle vector to widen it with undef elements, and then 2240 // a second shuffle vector to select between the loaded vector and the 2241 // incoming vector. 2242 SmallVector<Constant*, 8> Mask; 2243 Mask.reserve(VecTy->getNumElements()); 2244 for (unsigned i = 0; i != VecTy->getNumElements(); ++i) 2245 if (i >= BeginIndex && i < EndIndex) 2246 Mask.push_back(IRB.getInt32(i - BeginIndex)); 2247 else 2248 Mask.push_back(UndefValue::get(IRB.getInt32Ty())); 2249 V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), 2250 ConstantVector::get(Mask), 2251 Name + ".expand"); 2252 DEBUG(dbgs() << " shuffle1: " << *V << "\n"); 2253 2254 Mask.clear(); 2255 for (unsigned i = 0; i != VecTy->getNumElements(); ++i) 2256 if (i >= BeginIndex && i < EndIndex) 2257 Mask.push_back(IRB.getInt32(i)); 2258 else 2259 Mask.push_back(IRB.getInt32(i + VecTy->getNumElements())); 2260 V = IRB.CreateShuffleVector(V, Old, ConstantVector::get(Mask), 2261 Name + "insert"); 2262 DEBUG(dbgs() << " shuffle2: " << *V << "\n"); 2263 return V; 2264} 2265 2266namespace { 2267/// \brief Visitor to rewrite instructions using a partition of an alloca to 2268/// use a new alloca. 2269/// 2270/// Also implements the rewriting to vector-based accesses when the partition 2271/// passes the isVectorPromotionViable predicate. Most of the rewriting logic 2272/// lives here. 2273class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, 2274 bool> { 2275 // Befriend the base class so it can delegate to private visit methods. 2276 friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>; 2277 2278 const DataLayout &TD; 2279 AllocaPartitioning &P; 2280 SROA &Pass; 2281 AllocaInst &OldAI, &NewAI; 2282 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; 2283 Type *NewAllocaTy; 2284 2285 // If we are rewriting an alloca partition which can be written as pure 2286 // vector operations, we stash extra information here. When VecTy is 2287 // non-null, we have some strict guarantees about the rewriten alloca: 2288 // - The new alloca is exactly the size of the vector type here. 2289 // - The accesses all either map to the entire vector or to a single 2290 // element. 2291 // - The set of accessing instructions is only one of those handled above 2292 // in isVectorPromotionViable. Generally these are the same access kinds 2293 // which are promotable via mem2reg. 2294 VectorType *VecTy; 2295 Type *ElementTy; 2296 uint64_t ElementSize; 2297 2298 // This is a convenience and flag variable that will be null unless the new 2299 // alloca's integer operations should be widened to this integer type due to 2300 // passing isIntegerWideningViable above. If it is non-null, the desired 2301 // integer type will be stored here for easy access during rewriting. 2302 IntegerType *IntTy; 2303 2304 // The offset of the partition user currently being rewritten. 2305 uint64_t BeginOffset, EndOffset; 2306 Use *OldUse; 2307 Instruction *OldPtr; 2308 2309 // The name prefix to use when rewriting instructions for this alloca. 2310 std::string NamePrefix; 2311 2312public: 2313 AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P, 2314 AllocaPartitioning::iterator PI, 2315 SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, 2316 uint64_t NewBeginOffset, uint64_t NewEndOffset) 2317 : TD(TD), P(P), Pass(Pass), 2318 OldAI(OldAI), NewAI(NewAI), 2319 NewAllocaBeginOffset(NewBeginOffset), 2320 NewAllocaEndOffset(NewEndOffset), 2321 NewAllocaTy(NewAI.getAllocatedType()), 2322 VecTy(), ElementTy(), ElementSize(), IntTy(), 2323 BeginOffset(), EndOffset() { 2324 } 2325 2326 /// \brief Visit the users of the alloca partition and rewrite them. 2327 bool visitUsers(AllocaPartitioning::const_use_iterator I, 2328 AllocaPartitioning::const_use_iterator E) { 2329 if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P, 2330 NewAllocaBeginOffset, NewAllocaEndOffset, 2331 I, E)) { 2332 ++NumVectorized; 2333 VecTy = cast<VectorType>(NewAI.getAllocatedType()); 2334 ElementTy = VecTy->getElementType(); 2335 assert((TD.getTypeSizeInBits(VecTy->getScalarType()) % 8) == 0 && 2336 "Only multiple-of-8 sized vector elements are viable"); 2337 ElementSize = TD.getTypeSizeInBits(VecTy->getScalarType()) / 8; 2338 } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(), 2339 NewAllocaBeginOffset, P, I, E)) { 2340 IntTy = Type::getIntNTy(NewAI.getContext(), 2341 TD.getTypeSizeInBits(NewAI.getAllocatedType())); 2342 } 2343 bool CanSROA = true; 2344 for (; I != E; ++I) { 2345 if (!I->U) 2346 continue; // Skip dead uses. 2347 BeginOffset = I->BeginOffset; 2348 EndOffset = I->EndOffset; 2349 OldUse = I->U; 2350 OldPtr = cast<Instruction>(I->U->get()); 2351 NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str(); 2352 CanSROA &= visit(cast<Instruction>(I->U->getUser())); 2353 } 2354 if (VecTy) { 2355 assert(CanSROA); 2356 VecTy = 0; 2357 ElementTy = 0; 2358 ElementSize = 0; 2359 } 2360 if (IntTy) { 2361 assert(CanSROA); 2362 IntTy = 0; 2363 } 2364 return CanSROA; 2365 } 2366 2367private: 2368 // Every instruction which can end up as a user must have a rewrite rule. 2369 bool visitInstruction(Instruction &I) { 2370 DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); 2371 llvm_unreachable("No rewrite rule for this instruction!"); 2372 } 2373 2374 Twine getName(const Twine &Suffix) { 2375 return NamePrefix + Suffix; 2376 } 2377 2378 Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) { 2379 assert(BeginOffset >= NewAllocaBeginOffset); 2380 APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset); 2381 return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName("")); 2382 } 2383 2384 /// \brief Compute suitable alignment to access an offset into the new alloca. 2385 unsigned getOffsetAlign(uint64_t Offset) { 2386 unsigned NewAIAlign = NewAI.getAlignment(); 2387 if (!NewAIAlign) 2388 NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType()); 2389 return MinAlign(NewAIAlign, Offset); 2390 } 2391 2392 /// \brief Compute suitable alignment to access this partition of the new 2393 /// alloca. 2394 unsigned getPartitionAlign() { 2395 return getOffsetAlign(BeginOffset - NewAllocaBeginOffset); 2396 } 2397 2398 /// \brief Compute suitable alignment to access a type at an offset of the 2399 /// new alloca. 2400 /// 2401 /// \returns zero if the type's ABI alignment is a suitable alignment, 2402 /// otherwise returns the maximal suitable alignment. 2403 unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { 2404 unsigned Align = getOffsetAlign(Offset); 2405 return Align == TD.getABITypeAlignment(Ty) ? 0 : Align; 2406 } 2407 2408 /// \brief Compute suitable alignment to access a type at the beginning of 2409 /// this partition of the new alloca. 2410 /// 2411 /// See \c getOffsetTypeAlign for details; this routine delegates to it. 2412 unsigned getPartitionTypeAlign(Type *Ty) { 2413 return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset); 2414 } 2415 2416 unsigned getIndex(uint64_t Offset) { 2417 assert(VecTy && "Can only call getIndex when rewriting a vector"); 2418 uint64_t RelOffset = Offset - NewAllocaBeginOffset; 2419 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); 2420 uint32_t Index = RelOffset / ElementSize; 2421 assert(Index * ElementSize == RelOffset); 2422 return Index; 2423 } 2424 2425 void deleteIfTriviallyDead(Value *V) { 2426 Instruction *I = cast<Instruction>(V); 2427 if (isInstructionTriviallyDead(I)) 2428 Pass.DeadInsts.insert(I); 2429 } 2430 2431 Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB) { 2432 unsigned BeginIndex = getIndex(BeginOffset); 2433 unsigned EndIndex = getIndex(EndOffset); 2434 assert(EndIndex > BeginIndex && "Empty vector!"); 2435 2436 Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2437 getName(".load")); 2438 return extractVector(IRB, V, BeginIndex, EndIndex, getName(".vec")); 2439 } 2440 2441 Value *rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) { 2442 assert(IntTy && "We cannot insert an integer to the alloca"); 2443 assert(!LI.isVolatile()); 2444 Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2445 getName(".load")); 2446 V = convertValue(TD, IRB, V, IntTy); 2447 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2448 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2449 if (Offset > 0 || EndOffset < NewAllocaEndOffset) 2450 V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset, 2451 getName(".extract")); 2452 return V; 2453 } 2454 2455 bool visitLoadInst(LoadInst &LI) { 2456 DEBUG(dbgs() << " original: " << LI << "\n"); 2457 Value *OldOp = LI.getOperand(0); 2458 assert(OldOp == OldPtr); 2459 IRBuilder<> IRB(&LI); 2460 2461 uint64_t Size = EndOffset - BeginOffset; 2462 bool IsSplitIntLoad = Size < TD.getTypeStoreSize(LI.getType()); 2463 2464 // If this memory access can be shown to *statically* extend outside the 2465 // bounds of the original allocation it's behavior is undefined. Rather 2466 // than trying to transform it, just replace it with undef. 2467 // FIXME: We should do something more clever for functions being 2468 // instrumented by asan. 2469 // FIXME: Eventually, once ASan and friends can flush out bugs here, this 2470 // should be transformed to a load of null making it unreachable. 2471 uint64_t OldAllocSize = TD.getTypeAllocSize(OldAI.getAllocatedType()); 2472 if (TD.getTypeStoreSize(LI.getType()) > OldAllocSize) { 2473 LI.replaceAllUsesWith(UndefValue::get(LI.getType())); 2474 Pass.DeadInsts.insert(&LI); 2475 deleteIfTriviallyDead(OldOp); 2476 DEBUG(dbgs() << " to: undef!!\n"); 2477 return true; 2478 } 2479 2480 Type *TargetTy = IsSplitIntLoad ? Type::getIntNTy(LI.getContext(), Size * 8) 2481 : LI.getType(); 2482 bool IsPtrAdjusted = false; 2483 Value *V; 2484 if (VecTy) { 2485 V = rewriteVectorizedLoadInst(IRB); 2486 } else if (IntTy && LI.getType()->isIntegerTy()) { 2487 V = rewriteIntegerLoad(IRB, LI); 2488 } else if (BeginOffset == NewAllocaBeginOffset && 2489 canConvertValue(TD, NewAllocaTy, LI.getType())) { 2490 V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2491 LI.isVolatile(), getName(".load")); 2492 } else { 2493 Type *LTy = TargetTy->getPointerTo(); 2494 V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy), 2495 getPartitionTypeAlign(TargetTy), 2496 LI.isVolatile(), getName(".load")); 2497 IsPtrAdjusted = true; 2498 } 2499 V = convertValue(TD, IRB, V, TargetTy); 2500 2501 if (IsSplitIntLoad) { 2502 assert(!LI.isVolatile()); 2503 assert(LI.getType()->isIntegerTy() && 2504 "Only integer type loads and stores are split"); 2505 assert(LI.getType()->getIntegerBitWidth() == 2506 TD.getTypeStoreSizeInBits(LI.getType()) && 2507 "Non-byte-multiple bit width"); 2508 assert(LI.getType()->getIntegerBitWidth() == 2509 TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) && 2510 "Only alloca-wide loads can be split and recomposed"); 2511 // Move the insertion point just past the load so that we can refer to it. 2512 IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI))); 2513 // Create a placeholder value with the same type as LI to use as the 2514 // basis for the new value. This allows us to replace the uses of LI with 2515 // the computed value, and then replace the placeholder with LI, leaving 2516 // LI only used for this computation. 2517 Value *Placeholder 2518 = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); 2519 V = insertInteger(TD, IRB, Placeholder, V, BeginOffset, 2520 getName(".insert")); 2521 LI.replaceAllUsesWith(V); 2522 Placeholder->replaceAllUsesWith(&LI); 2523 delete Placeholder; 2524 } else { 2525 LI.replaceAllUsesWith(V); 2526 } 2527 2528 Pass.DeadInsts.insert(&LI); 2529 deleteIfTriviallyDead(OldOp); 2530 DEBUG(dbgs() << " to: " << *V << "\n"); 2531 return !LI.isVolatile() && !IsPtrAdjusted; 2532 } 2533 2534 bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, Value *V, 2535 StoreInst &SI, Value *OldOp) { 2536 unsigned BeginIndex = getIndex(BeginOffset); 2537 unsigned EndIndex = getIndex(EndOffset); 2538 assert(EndIndex > BeginIndex && "Empty vector!"); 2539 unsigned NumElements = EndIndex - BeginIndex; 2540 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 2541 Type *PartitionTy 2542 = (NumElements == 1) ? ElementTy 2543 : VectorType::get(ElementTy, NumElements); 2544 if (V->getType() != PartitionTy) 2545 V = convertValue(TD, IRB, V, PartitionTy); 2546 2547 // Mix in the existing elements. 2548 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2549 getName(".load")); 2550 V = insertVector(IRB, Old, V, BeginIndex, getName(".vec")); 2551 2552 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 2553 Pass.DeadInsts.insert(&SI); 2554 2555 (void)Store; 2556 DEBUG(dbgs() << " to: " << *Store << "\n"); 2557 return true; 2558 } 2559 2560 bool rewriteIntegerStore(IRBuilder<> &IRB, Value *V, StoreInst &SI) { 2561 assert(IntTy && "We cannot extract an integer from the alloca"); 2562 assert(!SI.isVolatile()); 2563 if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { 2564 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2565 getName(".oldload")); 2566 Old = convertValue(TD, IRB, Old, IntTy); 2567 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2568 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2569 V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset, 2570 getName(".insert")); 2571 } 2572 V = convertValue(TD, IRB, V, NewAllocaTy); 2573 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 2574 Pass.DeadInsts.insert(&SI); 2575 (void)Store; 2576 DEBUG(dbgs() << " to: " << *Store << "\n"); 2577 return true; 2578 } 2579 2580 bool visitStoreInst(StoreInst &SI) { 2581 DEBUG(dbgs() << " original: " << SI << "\n"); 2582 Value *OldOp = SI.getOperand(1); 2583 assert(OldOp == OldPtr); 2584 IRBuilder<> IRB(&SI); 2585 2586 Value *V = SI.getValueOperand(); 2587 2588 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2589 // alloca that should be re-examined after promoting this alloca. 2590 if (V->getType()->isPointerTy()) 2591 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) 2592 Pass.PostPromotionWorklist.insert(AI); 2593 2594 uint64_t Size = EndOffset - BeginOffset; 2595 if (Size < TD.getTypeStoreSize(V->getType())) { 2596 assert(!SI.isVolatile()); 2597 assert(V->getType()->isIntegerTy() && 2598 "Only integer type loads and stores are split"); 2599 assert(V->getType()->getIntegerBitWidth() == 2600 TD.getTypeStoreSizeInBits(V->getType()) && 2601 "Non-byte-multiple bit width"); 2602 assert(V->getType()->getIntegerBitWidth() == 2603 TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) && 2604 "Only alloca-wide stores can be split and recomposed"); 2605 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8); 2606 V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset, 2607 getName(".extract")); 2608 } 2609 2610 if (VecTy) 2611 return rewriteVectorizedStoreInst(IRB, V, SI, OldOp); 2612 if (IntTy && V->getType()->isIntegerTy()) 2613 return rewriteIntegerStore(IRB, V, SI); 2614 2615 StoreInst *NewSI; 2616 if (BeginOffset == NewAllocaBeginOffset && 2617 canConvertValue(TD, V->getType(), NewAllocaTy)) { 2618 V = convertValue(TD, IRB, V, NewAllocaTy); 2619 NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), 2620 SI.isVolatile()); 2621 } else { 2622 Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo()); 2623 NewSI = IRB.CreateAlignedStore(V, NewPtr, 2624 getPartitionTypeAlign(V->getType()), 2625 SI.isVolatile()); 2626 } 2627 (void)NewSI; 2628 Pass.DeadInsts.insert(&SI); 2629 deleteIfTriviallyDead(OldOp); 2630 2631 DEBUG(dbgs() << " to: " << *NewSI << "\n"); 2632 return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile(); 2633 } 2634 2635 /// \brief Compute an integer value from splatting an i8 across the given 2636 /// number of bytes. 2637 /// 2638 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't 2639 /// call this routine. 2640 /// FIXME: Heed the abvice above. 2641 /// 2642 /// \param V The i8 value to splat. 2643 /// \param Size The number of bytes in the output (assuming i8 is one byte) 2644 Value *getIntegerSplat(IRBuilder<> &IRB, Value *V, unsigned Size) { 2645 assert(Size > 0 && "Expected a positive number of bytes."); 2646 IntegerType *VTy = cast<IntegerType>(V->getType()); 2647 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte"); 2648 if (Size == 1) 2649 return V; 2650 2651 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); 2652 V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")), 2653 ConstantExpr::getUDiv( 2654 Constant::getAllOnesValue(SplatIntTy), 2655 ConstantExpr::getZExt( 2656 Constant::getAllOnesValue(V->getType()), 2657 SplatIntTy)), 2658 getName(".isplat")); 2659 return V; 2660 } 2661 2662 /// \brief Compute a vector splat for a given element value. 2663 Value *getVectorSplat(IRBuilder<> &IRB, Value *V, unsigned NumElements) { 2664 assert(NumElements > 0 && "Cannot splat to an empty vector."); 2665 2666 // First insert it into a one-element vector so we can shuffle it. It is 2667 // really silly that LLVM's IR requires this in order to form a splat. 2668 Value *Undef = UndefValue::get(VectorType::get(V->getType(), 1)); 2669 V = IRB.CreateInsertElement(Undef, V, IRB.getInt32(0), 2670 getName(".splatinsert")); 2671 2672 // Shuffle the value across the desired number of elements. 2673 SmallVector<Constant*, 8> Mask(NumElements, IRB.getInt32(0)); 2674 V = IRB.CreateShuffleVector(V, Undef, ConstantVector::get(Mask), 2675 getName(".splat")); 2676 DEBUG(dbgs() << " splat: " << *V << "\n"); 2677 return V; 2678 } 2679 2680 bool visitMemSetInst(MemSetInst &II) { 2681 DEBUG(dbgs() << " original: " << II << "\n"); 2682 IRBuilder<> IRB(&II); 2683 assert(II.getRawDest() == OldPtr); 2684 2685 // If the memset has a variable size, it cannot be split, just adjust the 2686 // pointer to the new alloca. 2687 if (!isa<Constant>(II.getLength())) { 2688 II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); 2689 Type *CstTy = II.getAlignmentCst()->getType(); 2690 II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign())); 2691 2692 deleteIfTriviallyDead(OldPtr); 2693 return false; 2694 } 2695 2696 // Record this instruction for deletion. 2697 Pass.DeadInsts.insert(&II); 2698 2699 Type *AllocaTy = NewAI.getAllocatedType(); 2700 Type *ScalarTy = AllocaTy->getScalarType(); 2701 2702 // If this doesn't map cleanly onto the alloca type, and that type isn't 2703 // a single value type, just emit a memset. 2704 if (!VecTy && !IntTy && 2705 (BeginOffset != NewAllocaBeginOffset || 2706 EndOffset != NewAllocaEndOffset || 2707 !AllocaTy->isSingleValueType() || 2708 !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)) || 2709 TD.getTypeSizeInBits(ScalarTy)%8 != 0)) { 2710 Type *SizeTy = II.getLength()->getType(); 2711 Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); 2712 CallInst *New 2713 = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB, 2714 II.getRawDest()->getType()), 2715 II.getValue(), Size, getPartitionAlign(), 2716 II.isVolatile()); 2717 (void)New; 2718 DEBUG(dbgs() << " to: " << *New << "\n"); 2719 return false; 2720 } 2721 2722 // If we can represent this as a simple value, we have to build the actual 2723 // value to store, which requires expanding the byte present in memset to 2724 // a sensible representation for the alloca type. This is essentially 2725 // splatting the byte to a sufficiently wide integer, splatting it across 2726 // any desired vector width, and bitcasting to the final type. 2727 uint64_t Size = EndOffset - BeginOffset; 2728 Value *V = getIntegerSplat(IRB, II.getValue(), Size); 2729 2730 if (VecTy) { 2731 // If this is a memset of a vectorized alloca, insert it. 2732 assert(ElementTy == ScalarTy); 2733 2734 unsigned BeginIndex = getIndex(BeginOffset); 2735 unsigned EndIndex = getIndex(EndOffset); 2736 assert(EndIndex > BeginIndex && "Empty vector!"); 2737 unsigned NumElements = EndIndex - BeginIndex; 2738 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 2739 2740 Value *Splat = getIntegerSplat(IRB, II.getValue(), 2741 TD.getTypeSizeInBits(ElementTy)/8); 2742 Splat = convertValue(TD, IRB, Splat, ElementTy); 2743 if (NumElements > 1) 2744 Splat = getVectorSplat(IRB, Splat, NumElements); 2745 2746 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2747 getName(".oldload")); 2748 V = insertVector(IRB, Old, Splat, BeginIndex, getName(".vec")); 2749 } else if (IntTy) { 2750 // If this is a memset on an alloca where we can widen stores, insert the 2751 // set integer. 2752 assert(!II.isVolatile()); 2753 2754 V = getIntegerSplat(IRB, II.getValue(), Size); 2755 2756 if (IntTy && (BeginOffset != NewAllocaBeginOffset || 2757 EndOffset != NewAllocaBeginOffset)) { 2758 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2759 getName(".oldload")); 2760 Old = convertValue(TD, IRB, Old, IntTy); 2761 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2762 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2763 V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert")); 2764 } else { 2765 assert(V->getType() == IntTy && 2766 "Wrong type for an alloca wide integer!"); 2767 } 2768 V = convertValue(TD, IRB, V, AllocaTy); 2769 } else { 2770 // Established these invariants above. 2771 assert(BeginOffset == NewAllocaBeginOffset); 2772 assert(EndOffset == NewAllocaEndOffset); 2773 2774 V = getIntegerSplat(IRB, II.getValue(), 2775 TD.getTypeSizeInBits(ScalarTy)/8); 2776 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy)) 2777 V = getVectorSplat(IRB, V, AllocaVecTy->getNumElements()); 2778 2779 V = convertValue(TD, IRB, V, AllocaTy); 2780 } 2781 2782 Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), 2783 II.isVolatile()); 2784 (void)New; 2785 DEBUG(dbgs() << " to: " << *New << "\n"); 2786 return !II.isVolatile(); 2787 } 2788 2789 bool visitMemTransferInst(MemTransferInst &II) { 2790 // Rewriting of memory transfer instructions can be a bit tricky. We break 2791 // them into two categories: split intrinsics and unsplit intrinsics. 2792 2793 DEBUG(dbgs() << " original: " << II << "\n"); 2794 IRBuilder<> IRB(&II); 2795 2796 assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr); 2797 bool IsDest = II.getRawDest() == OldPtr; 2798 2799 const AllocaPartitioning::MemTransferOffsets &MTO 2800 = P.getMemTransferOffsets(II); 2801 2802 // Compute the relative offset within the transfer. 2803 unsigned IntPtrWidth = TD.getPointerSizeInBits(); 2804 APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin 2805 : MTO.SourceBegin)); 2806 2807 unsigned Align = II.getAlignment(); 2808 if (Align > 1) 2809 Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), 2810 MinAlign(II.getAlignment(), getPartitionAlign())); 2811 2812 // For unsplit intrinsics, we simply modify the source and destination 2813 // pointers in place. This isn't just an optimization, it is a matter of 2814 // correctness. With unsplit intrinsics we may be dealing with transfers 2815 // within a single alloca before SROA ran, or with transfers that have 2816 // a variable length. We may also be dealing with memmove instead of 2817 // memcpy, and so simply updating the pointers is the necessary for us to 2818 // update both source and dest of a single call. 2819 if (!MTO.IsSplittable) { 2820 Value *OldOp = IsDest ? II.getRawDest() : II.getRawSource(); 2821 if (IsDest) 2822 II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); 2823 else 2824 II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType())); 2825 2826 Type *CstTy = II.getAlignmentCst()->getType(); 2827 II.setAlignment(ConstantInt::get(CstTy, Align)); 2828 2829 DEBUG(dbgs() << " to: " << II << "\n"); 2830 deleteIfTriviallyDead(OldOp); 2831 return false; 2832 } 2833 // For split transfer intrinsics we have an incredibly useful assurance: 2834 // the source and destination do not reside within the same alloca, and at 2835 // least one of them does not escape. This means that we can replace 2836 // memmove with memcpy, and we don't need to worry about all manner of 2837 // downsides to splitting and transforming the operations. 2838 2839 // If this doesn't map cleanly onto the alloca type, and that type isn't 2840 // a single value type, just emit a memcpy. 2841 bool EmitMemCpy 2842 = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset || 2843 EndOffset != NewAllocaEndOffset || 2844 !NewAI.getAllocatedType()->isSingleValueType()); 2845 2846 // If we're just going to emit a memcpy, the alloca hasn't changed, and the 2847 // size hasn't been shrunk based on analysis of the viable range, this is 2848 // a no-op. 2849 if (EmitMemCpy && &OldAI == &NewAI) { 2850 uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin; 2851 uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd; 2852 // Ensure the start lines up. 2853 assert(BeginOffset == OrigBegin); 2854 (void)OrigBegin; 2855 2856 // Rewrite the size as needed. 2857 if (EndOffset != OrigEnd) 2858 II.setLength(ConstantInt::get(II.getLength()->getType(), 2859 EndOffset - BeginOffset)); 2860 return false; 2861 } 2862 // Record this instruction for deletion. 2863 Pass.DeadInsts.insert(&II); 2864 2865 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2866 // alloca that should be re-examined after rewriting this instruction. 2867 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); 2868 if (AllocaInst *AI 2869 = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) 2870 Pass.Worklist.insert(AI); 2871 2872 if (EmitMemCpy) { 2873 Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() 2874 : II.getRawDest()->getType(); 2875 2876 // Compute the other pointer, folding as much as possible to produce 2877 // a single, simple GEP in most cases. 2878 OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, 2879 getName("." + OtherPtr->getName())); 2880 2881 Value *OurPtr 2882 = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() 2883 : II.getRawSource()->getType()); 2884 Type *SizeTy = II.getLength()->getType(); 2885 Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); 2886 2887 CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr, 2888 IsDest ? OtherPtr : OurPtr, 2889 Size, Align, II.isVolatile()); 2890 (void)New; 2891 DEBUG(dbgs() << " to: " << *New << "\n"); 2892 return false; 2893 } 2894 2895 // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy 2896 // is equivalent to 1, but that isn't true if we end up rewriting this as 2897 // a load or store. 2898 if (!Align) 2899 Align = 1; 2900 2901 bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && 2902 EndOffset == NewAllocaEndOffset; 2903 uint64_t Size = EndOffset - BeginOffset; 2904 unsigned BeginIndex = VecTy ? getIndex(BeginOffset) : 0; 2905 unsigned EndIndex = VecTy ? getIndex(EndOffset) : 0; 2906 unsigned NumElements = EndIndex - BeginIndex; 2907 IntegerType *SubIntTy 2908 = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; 2909 2910 Type *OtherPtrTy = NewAI.getType(); 2911 if (VecTy && !IsWholeAlloca) { 2912 if (NumElements == 1) 2913 OtherPtrTy = VecTy->getElementType(); 2914 else 2915 OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements); 2916 2917 OtherPtrTy = OtherPtrTy->getPointerTo(); 2918 } else if (IntTy && !IsWholeAlloca) { 2919 OtherPtrTy = SubIntTy->getPointerTo(); 2920 } 2921 2922 Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, 2923 getName("." + OtherPtr->getName())); 2924 Value *DstPtr = &NewAI; 2925 if (!IsDest) 2926 std::swap(SrcPtr, DstPtr); 2927 2928 Value *Src; 2929 if (VecTy && !IsWholeAlloca && !IsDest) { 2930 Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2931 getName(".load")); 2932 Src = extractVector(IRB, Src, BeginIndex, EndIndex, getName(".vec")); 2933 } else if (IntTy && !IsWholeAlloca && !IsDest) { 2934 Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2935 getName(".load")); 2936 Src = convertValue(TD, IRB, Src, IntTy); 2937 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2938 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2939 Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract")); 2940 } else { 2941 Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), 2942 getName(".copyload")); 2943 } 2944 2945 if (VecTy && !IsWholeAlloca && IsDest) { 2946 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2947 getName(".oldload")); 2948 Src = insertVector(IRB, Old, Src, BeginIndex, getName(".vec")); 2949 } else if (IntTy && !IsWholeAlloca && IsDest) { 2950 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2951 getName(".oldload")); 2952 Old = convertValue(TD, IRB, Old, IntTy); 2953 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2954 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2955 Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert")); 2956 Src = convertValue(TD, IRB, Src, NewAllocaTy); 2957 } 2958 2959 StoreInst *Store = cast<StoreInst>( 2960 IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile())); 2961 (void)Store; 2962 DEBUG(dbgs() << " to: " << *Store << "\n"); 2963 return !II.isVolatile(); 2964 } 2965 2966 bool visitIntrinsicInst(IntrinsicInst &II) { 2967 assert(II.getIntrinsicID() == Intrinsic::lifetime_start || 2968 II.getIntrinsicID() == Intrinsic::lifetime_end); 2969 DEBUG(dbgs() << " original: " << II << "\n"); 2970 IRBuilder<> IRB(&II); 2971 assert(II.getArgOperand(1) == OldPtr); 2972 2973 // Record this instruction for deletion. 2974 Pass.DeadInsts.insert(&II); 2975 2976 ConstantInt *Size 2977 = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), 2978 EndOffset - BeginOffset); 2979 Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType()); 2980 Value *New; 2981 if (II.getIntrinsicID() == Intrinsic::lifetime_start) 2982 New = IRB.CreateLifetimeStart(Ptr, Size); 2983 else 2984 New = IRB.CreateLifetimeEnd(Ptr, Size); 2985 2986 DEBUG(dbgs() << " to: " << *New << "\n"); 2987 return true; 2988 } 2989 2990 bool visitPHINode(PHINode &PN) { 2991 DEBUG(dbgs() << " original: " << PN << "\n"); 2992 2993 // We would like to compute a new pointer in only one place, but have it be 2994 // as local as possible to the PHI. To do that, we re-use the location of 2995 // the old pointer, which necessarily must be in the right position to 2996 // dominate the PHI. 2997 IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr)); 2998 2999 Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); 3000 // Replace the operands which were using the old pointer. 3001 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); 3002 3003 DEBUG(dbgs() << " to: " << PN << "\n"); 3004 deleteIfTriviallyDead(OldPtr); 3005 return false; 3006 } 3007 3008 bool visitSelectInst(SelectInst &SI) { 3009 DEBUG(dbgs() << " original: " << SI << "\n"); 3010 IRBuilder<> IRB(&SI); 3011 3012 // Find the operand we need to rewrite here. 3013 bool IsTrueVal = SI.getTrueValue() == OldPtr; 3014 if (IsTrueVal) 3015 assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!"); 3016 else 3017 assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!"); 3018 3019 Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType()); 3020 SI.setOperand(IsTrueVal ? 1 : 2, NewPtr); 3021 DEBUG(dbgs() << " to: " << SI << "\n"); 3022 deleteIfTriviallyDead(OldPtr); 3023 return false; 3024 } 3025 3026}; 3027} 3028 3029namespace { 3030/// \brief Visitor to rewrite aggregate loads and stores as scalar. 3031/// 3032/// This pass aggressively rewrites all aggregate loads and stores on 3033/// a particular pointer (or any pointer derived from it which we can identify) 3034/// with scalar loads and stores. 3035class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { 3036 // Befriend the base class so it can delegate to private visit methods. 3037 friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; 3038 3039 const DataLayout &TD; 3040 3041 /// Queue of pointer uses to analyze and potentially rewrite. 3042 SmallVector<Use *, 8> Queue; 3043 3044 /// Set to prevent us from cycling with phi nodes and loops. 3045 SmallPtrSet<User *, 8> Visited; 3046 3047 /// The current pointer use being rewritten. This is used to dig up the used 3048 /// value (as opposed to the user). 3049 Use *U; 3050 3051public: 3052 AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {} 3053 3054 /// Rewrite loads and stores through a pointer and all pointers derived from 3055 /// it. 3056 bool rewrite(Instruction &I) { 3057 DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); 3058 enqueueUsers(I); 3059 bool Changed = false; 3060 while (!Queue.empty()) { 3061 U = Queue.pop_back_val(); 3062 Changed |= visit(cast<Instruction>(U->getUser())); 3063 } 3064 return Changed; 3065 } 3066 3067private: 3068 /// Enqueue all the users of the given instruction for further processing. 3069 /// This uses a set to de-duplicate users. 3070 void enqueueUsers(Instruction &I) { 3071 for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; 3072 ++UI) 3073 if (Visited.insert(*UI)) 3074 Queue.push_back(&UI.getUse()); 3075 } 3076 3077 // Conservative default is to not rewrite anything. 3078 bool visitInstruction(Instruction &I) { return false; } 3079 3080 /// \brief Generic recursive split emission class. 3081 template <typename Derived> 3082 class OpSplitter { 3083 protected: 3084 /// The builder used to form new instructions. 3085 IRBuilder<> IRB; 3086 /// The indices which to be used with insert- or extractvalue to select the 3087 /// appropriate value within the aggregate. 3088 SmallVector<unsigned, 4> Indices; 3089 /// The indices to a GEP instruction which will move Ptr to the correct slot 3090 /// within the aggregate. 3091 SmallVector<Value *, 4> GEPIndices; 3092 /// The base pointer of the original op, used as a base for GEPing the 3093 /// split operations. 3094 Value *Ptr; 3095 3096 /// Initialize the splitter with an insertion point, Ptr and start with a 3097 /// single zero GEP index. 3098 OpSplitter(Instruction *InsertionPoint, Value *Ptr) 3099 : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} 3100 3101 public: 3102 /// \brief Generic recursive split emission routine. 3103 /// 3104 /// This method recursively splits an aggregate op (load or store) into 3105 /// scalar or vector ops. It splits recursively until it hits a single value 3106 /// and emits that single value operation via the template argument. 3107 /// 3108 /// The logic of this routine relies on GEPs and insertvalue and 3109 /// extractvalue all operating with the same fundamental index list, merely 3110 /// formatted differently (GEPs need actual values). 3111 /// 3112 /// \param Ty The type being split recursively into smaller ops. 3113 /// \param Agg The aggregate value being built up or stored, depending on 3114 /// whether this is splitting a load or a store respectively. 3115 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) { 3116 if (Ty->isSingleValueType()) 3117 return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name); 3118 3119 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 3120 unsigned OldSize = Indices.size(); 3121 (void)OldSize; 3122 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size; 3123 ++Idx) { 3124 assert(Indices.size() == OldSize && "Did not return to the old size"); 3125 Indices.push_back(Idx); 3126 GEPIndices.push_back(IRB.getInt32(Idx)); 3127 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx)); 3128 GEPIndices.pop_back(); 3129 Indices.pop_back(); 3130 } 3131 return; 3132 } 3133 3134 if (StructType *STy = dyn_cast<StructType>(Ty)) { 3135 unsigned OldSize = Indices.size(); 3136 (void)OldSize; 3137 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size; 3138 ++Idx) { 3139 assert(Indices.size() == OldSize && "Did not return to the old size"); 3140 Indices.push_back(Idx); 3141 GEPIndices.push_back(IRB.getInt32(Idx)); 3142 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx)); 3143 GEPIndices.pop_back(); 3144 Indices.pop_back(); 3145 } 3146 return; 3147 } 3148 3149 llvm_unreachable("Only arrays and structs are aggregate loadable types"); 3150 } 3151 }; 3152 3153 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { 3154 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr) 3155 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} 3156 3157 /// Emit a leaf load of a single value. This is called at the leaves of the 3158 /// recursive emission to actually load values. 3159 void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { 3160 assert(Ty->isSingleValueType()); 3161 // Load the single value and insert it using the indices. 3162 Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices, 3163 Name + ".gep"), 3164 Name + ".load"); 3165 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); 3166 DEBUG(dbgs() << " to: " << *Load << "\n"); 3167 } 3168 }; 3169 3170 bool visitLoadInst(LoadInst &LI) { 3171 assert(LI.getPointerOperand() == *U); 3172 if (!LI.isSimple() || LI.getType()->isSingleValueType()) 3173 return false; 3174 3175 // We have an aggregate being loaded, split it apart. 3176 DEBUG(dbgs() << " original: " << LI << "\n"); 3177 LoadOpSplitter Splitter(&LI, *U); 3178 Value *V = UndefValue::get(LI.getType()); 3179 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); 3180 LI.replaceAllUsesWith(V); 3181 LI.eraseFromParent(); 3182 return true; 3183 } 3184 3185 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { 3186 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) 3187 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} 3188 3189 /// Emit a leaf store of a single value. This is called at the leaves of the 3190 /// recursive emission to actually produce stores. 3191 void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { 3192 assert(Ty->isSingleValueType()); 3193 // Extract the single value and store it using the indices. 3194 Value *Store = IRB.CreateStore( 3195 IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), 3196 IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); 3197 (void)Store; 3198 DEBUG(dbgs() << " to: " << *Store << "\n"); 3199 } 3200 }; 3201 3202 bool visitStoreInst(StoreInst &SI) { 3203 if (!SI.isSimple() || SI.getPointerOperand() != *U) 3204 return false; 3205 Value *V = SI.getValueOperand(); 3206 if (V->getType()->isSingleValueType()) 3207 return false; 3208 3209 // We have an aggregate being stored, split it apart. 3210 DEBUG(dbgs() << " original: " << SI << "\n"); 3211 StoreOpSplitter Splitter(&SI, *U); 3212 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); 3213 SI.eraseFromParent(); 3214 return true; 3215 } 3216 3217 bool visitBitCastInst(BitCastInst &BC) { 3218 enqueueUsers(BC); 3219 return false; 3220 } 3221 3222 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { 3223 enqueueUsers(GEPI); 3224 return false; 3225 } 3226 3227 bool visitPHINode(PHINode &PN) { 3228 enqueueUsers(PN); 3229 return false; 3230 } 3231 3232 bool visitSelectInst(SelectInst &SI) { 3233 enqueueUsers(SI); 3234 return false; 3235 } 3236}; 3237} 3238 3239/// \brief Strip aggregate type wrapping. 3240/// 3241/// This removes no-op aggregate types wrapping an underlying type. It will 3242/// strip as many layers of types as it can without changing either the type 3243/// size or the allocated size. 3244static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { 3245 if (Ty->isSingleValueType()) 3246 return Ty; 3247 3248 uint64_t AllocSize = DL.getTypeAllocSize(Ty); 3249 uint64_t TypeSize = DL.getTypeSizeInBits(Ty); 3250 3251 Type *InnerTy; 3252 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 3253 InnerTy = ArrTy->getElementType(); 3254 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 3255 const StructLayout *SL = DL.getStructLayout(STy); 3256 unsigned Index = SL->getElementContainingOffset(0); 3257 InnerTy = STy->getElementType(Index); 3258 } else { 3259 return Ty; 3260 } 3261 3262 if (AllocSize > DL.getTypeAllocSize(InnerTy) || 3263 TypeSize > DL.getTypeSizeInBits(InnerTy)) 3264 return Ty; 3265 3266 return stripAggregateTypeWrapping(DL, InnerTy); 3267} 3268 3269/// \brief Try to find a partition of the aggregate type passed in for a given 3270/// offset and size. 3271/// 3272/// This recurses through the aggregate type and tries to compute a subtype 3273/// based on the offset and size. When the offset and size span a sub-section 3274/// of an array, it will even compute a new array type for that sub-section, 3275/// and the same for structs. 3276/// 3277/// Note that this routine is very strict and tries to find a partition of the 3278/// type which produces the *exact* right offset and size. It is not forgiving 3279/// when the size or offset cause either end of type-based partition to be off. 3280/// Also, this is a best-effort routine. It is reasonable to give up and not 3281/// return a type if necessary. 3282static Type *getTypePartition(const DataLayout &TD, Type *Ty, 3283 uint64_t Offset, uint64_t Size) { 3284 if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size) 3285 return stripAggregateTypeWrapping(TD, Ty); 3286 if (Offset > TD.getTypeAllocSize(Ty) || 3287 (TD.getTypeAllocSize(Ty) - Offset) < Size) 3288 return 0; 3289 3290 if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { 3291 // We can't partition pointers... 3292 if (SeqTy->isPointerTy()) 3293 return 0; 3294 3295 Type *ElementTy = SeqTy->getElementType(); 3296 uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); 3297 uint64_t NumSkippedElements = Offset / ElementSize; 3298 if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) 3299 if (NumSkippedElements >= ArrTy->getNumElements()) 3300 return 0; 3301 if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) 3302 if (NumSkippedElements >= VecTy->getNumElements()) 3303 return 0; 3304 Offset -= NumSkippedElements * ElementSize; 3305 3306 // First check if we need to recurse. 3307 if (Offset > 0 || Size < ElementSize) { 3308 // Bail if the partition ends in a different array element. 3309 if ((Offset + Size) > ElementSize) 3310 return 0; 3311 // Recurse through the element type trying to peel off offset bytes. 3312 return getTypePartition(TD, ElementTy, Offset, Size); 3313 } 3314 assert(Offset == 0); 3315 3316 if (Size == ElementSize) 3317 return stripAggregateTypeWrapping(TD, ElementTy); 3318 assert(Size > ElementSize); 3319 uint64_t NumElements = Size / ElementSize; 3320 if (NumElements * ElementSize != Size) 3321 return 0; 3322 return ArrayType::get(ElementTy, NumElements); 3323 } 3324 3325 StructType *STy = dyn_cast<StructType>(Ty); 3326 if (!STy) 3327 return 0; 3328 3329 const StructLayout *SL = TD.getStructLayout(STy); 3330 if (Offset >= SL->getSizeInBytes()) 3331 return 0; 3332 uint64_t EndOffset = Offset + Size; 3333 if (EndOffset > SL->getSizeInBytes()) 3334 return 0; 3335 3336 unsigned Index = SL->getElementContainingOffset(Offset); 3337 Offset -= SL->getElementOffset(Index); 3338 3339 Type *ElementTy = STy->getElementType(Index); 3340 uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); 3341 if (Offset >= ElementSize) 3342 return 0; // The offset points into alignment padding. 3343 3344 // See if any partition must be contained by the element. 3345 if (Offset > 0 || Size < ElementSize) { 3346 if ((Offset + Size) > ElementSize) 3347 return 0; 3348 return getTypePartition(TD, ElementTy, Offset, Size); 3349 } 3350 assert(Offset == 0); 3351 3352 if (Size == ElementSize) 3353 return stripAggregateTypeWrapping(TD, ElementTy); 3354 3355 StructType::element_iterator EI = STy->element_begin() + Index, 3356 EE = STy->element_end(); 3357 if (EndOffset < SL->getSizeInBytes()) { 3358 unsigned EndIndex = SL->getElementContainingOffset(EndOffset); 3359 if (Index == EndIndex) 3360 return 0; // Within a single element and its padding. 3361 3362 // Don't try to form "natural" types if the elements don't line up with the 3363 // expected size. 3364 // FIXME: We could potentially recurse down through the last element in the 3365 // sub-struct to find a natural end point. 3366 if (SL->getElementOffset(EndIndex) != EndOffset) 3367 return 0; 3368 3369 assert(Index < EndIndex); 3370 EE = STy->element_begin() + EndIndex; 3371 } 3372 3373 // Try to build up a sub-structure. 3374 StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), 3375 STy->isPacked()); 3376 const StructLayout *SubSL = TD.getStructLayout(SubTy); 3377 if (Size != SubSL->getSizeInBytes()) 3378 return 0; // The sub-struct doesn't have quite the size needed. 3379 3380 return SubTy; 3381} 3382 3383/// \brief Rewrite an alloca partition's users. 3384/// 3385/// This routine drives both of the rewriting goals of the SROA pass. It tries 3386/// to rewrite uses of an alloca partition to be conducive for SSA value 3387/// promotion. If the partition needs a new, more refined alloca, this will 3388/// build that new alloca, preserving as much type information as possible, and 3389/// rewrite the uses of the old alloca to point at the new one and have the 3390/// appropriate new offsets. It also evaluates how successful the rewrite was 3391/// at enabling promotion and if it was successful queues the alloca to be 3392/// promoted. 3393bool SROA::rewriteAllocaPartition(AllocaInst &AI, 3394 AllocaPartitioning &P, 3395 AllocaPartitioning::iterator PI) { 3396 uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset; 3397 bool IsLive = false; 3398 for (AllocaPartitioning::use_iterator UI = P.use_begin(PI), 3399 UE = P.use_end(PI); 3400 UI != UE && !IsLive; ++UI) 3401 if (UI->U) 3402 IsLive = true; 3403 if (!IsLive) 3404 return false; // No live uses left of this partition. 3405 3406 DEBUG(dbgs() << "Speculating PHIs and selects in partition " 3407 << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n"); 3408 3409 PHIOrSelectSpeculator Speculator(*TD, P, *this); 3410 DEBUG(dbgs() << " speculating "); 3411 DEBUG(P.print(dbgs(), PI, "")); 3412 Speculator.visitUsers(PI); 3413 3414 // Try to compute a friendly type for this partition of the alloca. This 3415 // won't always succeed, in which case we fall back to a legal integer type 3416 // or an i8 array of an appropriate size. 3417 Type *AllocaTy = 0; 3418 if (Type *PartitionTy = P.getCommonType(PI)) 3419 if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize) 3420 AllocaTy = PartitionTy; 3421 if (!AllocaTy) 3422 if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(), 3423 PI->BeginOffset, AllocaSize)) 3424 AllocaTy = PartitionTy; 3425 if ((!AllocaTy || 3426 (AllocaTy->isArrayTy() && 3427 AllocaTy->getArrayElementType()->isIntegerTy())) && 3428 TD->isLegalInteger(AllocaSize * 8)) 3429 AllocaTy = Type::getIntNTy(*C, AllocaSize * 8); 3430 if (!AllocaTy) 3431 AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize); 3432 assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize); 3433 3434 // Check for the case where we're going to rewrite to a new alloca of the 3435 // exact same type as the original, and with the same access offsets. In that 3436 // case, re-use the existing alloca, but still run through the rewriter to 3437 // performe phi and select speculation. 3438 AllocaInst *NewAI; 3439 if (AllocaTy == AI.getAllocatedType()) { 3440 assert(PI->BeginOffset == 0 && 3441 "Non-zero begin offset but same alloca type"); 3442 assert(PI == P.begin() && "Begin offset is zero on later partition"); 3443 NewAI = &AI; 3444 } else { 3445 unsigned Alignment = AI.getAlignment(); 3446 if (!Alignment) { 3447 // The minimum alignment which users can rely on when the explicit 3448 // alignment is omitted or zero is that required by the ABI for this 3449 // type. 3450 Alignment = TD->getABITypeAlignment(AI.getAllocatedType()); 3451 } 3452 Alignment = MinAlign(Alignment, PI->BeginOffset); 3453 // If we will get at least this much alignment from the type alone, leave 3454 // the alloca's alignment unconstrained. 3455 if (Alignment <= TD->getABITypeAlignment(AllocaTy)) 3456 Alignment = 0; 3457 NewAI = new AllocaInst(AllocaTy, 0, Alignment, 3458 AI.getName() + ".sroa." + Twine(PI - P.begin()), 3459 &AI); 3460 ++NumNewAllocas; 3461 } 3462 3463 DEBUG(dbgs() << "Rewriting alloca partition " 3464 << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: " 3465 << *NewAI << "\n"); 3466 3467 // Track the high watermark of the post-promotion worklist. We will reset it 3468 // to this point if the alloca is not in fact scheduled for promotion. 3469 unsigned PPWOldSize = PostPromotionWorklist.size(); 3470 3471 AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI, 3472 PI->BeginOffset, PI->EndOffset); 3473 DEBUG(dbgs() << " rewriting "); 3474 DEBUG(P.print(dbgs(), PI, "")); 3475 bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI)); 3476 if (Promotable) { 3477 DEBUG(dbgs() << " and queuing for promotion\n"); 3478 PromotableAllocas.push_back(NewAI); 3479 } else if (NewAI != &AI) { 3480 // If we can't promote the alloca, iterate on it to check for new 3481 // refinements exposed by splitting the current alloca. Don't iterate on an 3482 // alloca which didn't actually change and didn't get promoted. 3483 Worklist.insert(NewAI); 3484 } 3485 3486 // Drop any post-promotion work items if promotion didn't happen. 3487 if (!Promotable) 3488 while (PostPromotionWorklist.size() > PPWOldSize) 3489 PostPromotionWorklist.pop_back(); 3490 3491 return true; 3492} 3493 3494/// \brief Walks the partitioning of an alloca rewriting uses of each partition. 3495bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) { 3496 bool Changed = false; 3497 for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE; 3498 ++PI) 3499 Changed |= rewriteAllocaPartition(AI, P, PI); 3500 3501 return Changed; 3502} 3503 3504/// \brief Analyze an alloca for SROA. 3505/// 3506/// This analyzes the alloca to ensure we can reason about it, builds 3507/// a partitioning of the alloca, and then hands it off to be split and 3508/// rewritten as needed. 3509bool SROA::runOnAlloca(AllocaInst &AI) { 3510 DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); 3511 ++NumAllocasAnalyzed; 3512 3513 // Special case dead allocas, as they're trivial. 3514 if (AI.use_empty()) { 3515 AI.eraseFromParent(); 3516 return true; 3517 } 3518 3519 // Skip alloca forms that this analysis can't handle. 3520 if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || 3521 TD->getTypeAllocSize(AI.getAllocatedType()) == 0) 3522 return false; 3523 3524 bool Changed = false; 3525 3526 // First, split any FCA loads and stores touching this alloca to promote 3527 // better splitting and promotion opportunities. 3528 AggLoadStoreRewriter AggRewriter(*TD); 3529 Changed |= AggRewriter.rewrite(AI); 3530 3531 // Build the partition set using a recursive instruction-visiting builder. 3532 AllocaPartitioning P(*TD, AI); 3533 DEBUG(P.print(dbgs())); 3534 if (P.isEscaped()) 3535 return Changed; 3536 3537 // Delete all the dead users of this alloca before splitting and rewriting it. 3538 for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(), 3539 DE = P.dead_user_end(); 3540 DI != DE; ++DI) { 3541 Changed = true; 3542 (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType())); 3543 DeadInsts.insert(*DI); 3544 } 3545 for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(), 3546 DE = P.dead_op_end(); 3547 DO != DE; ++DO) { 3548 Value *OldV = **DO; 3549 // Clobber the use with an undef value. 3550 **DO = UndefValue::get(OldV->getType()); 3551 if (Instruction *OldI = dyn_cast<Instruction>(OldV)) 3552 if (isInstructionTriviallyDead(OldI)) { 3553 Changed = true; 3554 DeadInsts.insert(OldI); 3555 } 3556 } 3557 3558 // No partitions to split. Leave the dead alloca for a later pass to clean up. 3559 if (P.begin() == P.end()) 3560 return Changed; 3561 3562 return splitAlloca(AI, P) || Changed; 3563} 3564 3565/// \brief Delete the dead instructions accumulated in this run. 3566/// 3567/// Recursively deletes the dead instructions we've accumulated. This is done 3568/// at the very end to maximize locality of the recursive delete and to 3569/// minimize the problems of invalidated instruction pointers as such pointers 3570/// are used heavily in the intermediate stages of the algorithm. 3571/// 3572/// We also record the alloca instructions deleted here so that they aren't 3573/// subsequently handed to mem2reg to promote. 3574void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { 3575 while (!DeadInsts.empty()) { 3576 Instruction *I = DeadInsts.pop_back_val(); 3577 DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); 3578 3579 I->replaceAllUsesWith(UndefValue::get(I->getType())); 3580 3581 for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) 3582 if (Instruction *U = dyn_cast<Instruction>(*OI)) { 3583 // Zero out the operand and see if it becomes trivially dead. 3584 *OI = 0; 3585 if (isInstructionTriviallyDead(U)) 3586 DeadInsts.insert(U); 3587 } 3588 3589 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) 3590 DeletedAllocas.insert(AI); 3591 3592 ++NumDeleted; 3593 I->eraseFromParent(); 3594 } 3595} 3596 3597/// \brief Promote the allocas, using the best available technique. 3598/// 3599/// This attempts to promote whatever allocas have been identified as viable in 3600/// the PromotableAllocas list. If that list is empty, there is nothing to do. 3601/// If there is a domtree available, we attempt to promote using the full power 3602/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is 3603/// based on the SSAUpdater utilities. This function returns whether any 3604/// promotion occured. 3605bool SROA::promoteAllocas(Function &F) { 3606 if (PromotableAllocas.empty()) 3607 return false; 3608 3609 NumPromoted += PromotableAllocas.size(); 3610 3611 if (DT && !ForceSSAUpdater) { 3612 DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); 3613 PromoteMemToReg(PromotableAllocas, *DT); 3614 PromotableAllocas.clear(); 3615 return true; 3616 } 3617 3618 DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); 3619 SSAUpdater SSA; 3620 DIBuilder DIB(*F.getParent()); 3621 SmallVector<Instruction*, 64> Insts; 3622 3623 for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { 3624 AllocaInst *AI = PromotableAllocas[Idx]; 3625 for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); 3626 UI != UE;) { 3627 Instruction *I = cast<Instruction>(*UI++); 3628 // FIXME: Currently the SSAUpdater infrastructure doesn't reason about 3629 // lifetime intrinsics and so we strip them (and the bitcasts+GEPs 3630 // leading to them) here. Eventually it should use them to optimize the 3631 // scalar values produced. 3632 if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) { 3633 assert(onlyUsedByLifetimeMarkers(I) && 3634 "Found a bitcast used outside of a lifetime marker."); 3635 while (!I->use_empty()) 3636 cast<Instruction>(*I->use_begin())->eraseFromParent(); 3637 I->eraseFromParent(); 3638 continue; 3639 } 3640 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 3641 assert(II->getIntrinsicID() == Intrinsic::lifetime_start || 3642 II->getIntrinsicID() == Intrinsic::lifetime_end); 3643 II->eraseFromParent(); 3644 continue; 3645 } 3646 3647 Insts.push_back(I); 3648 } 3649 AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); 3650 Insts.clear(); 3651 } 3652 3653 PromotableAllocas.clear(); 3654 return true; 3655} 3656 3657namespace { 3658 /// \brief A predicate to test whether an alloca belongs to a set. 3659 class IsAllocaInSet { 3660 typedef SmallPtrSet<AllocaInst *, 4> SetType; 3661 const SetType &Set; 3662 3663 public: 3664 typedef AllocaInst *argument_type; 3665 3666 IsAllocaInSet(const SetType &Set) : Set(Set) {} 3667 bool operator()(AllocaInst *AI) const { return Set.count(AI); } 3668 }; 3669} 3670 3671bool SROA::runOnFunction(Function &F) { 3672 DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); 3673 C = &F.getContext(); 3674 TD = getAnalysisIfAvailable<DataLayout>(); 3675 if (!TD) { 3676 DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); 3677 return false; 3678 } 3679 DT = getAnalysisIfAvailable<DominatorTree>(); 3680 3681 BasicBlock &EntryBB = F.getEntryBlock(); 3682 for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end()); 3683 I != E; ++I) 3684 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) 3685 Worklist.insert(AI); 3686 3687 bool Changed = false; 3688 // A set of deleted alloca instruction pointers which should be removed from 3689 // the list of promotable allocas. 3690 SmallPtrSet<AllocaInst *, 4> DeletedAllocas; 3691 3692 do { 3693 while (!Worklist.empty()) { 3694 Changed |= runOnAlloca(*Worklist.pop_back_val()); 3695 deleteDeadInstructions(DeletedAllocas); 3696 3697 // Remove the deleted allocas from various lists so that we don't try to 3698 // continue processing them. 3699 if (!DeletedAllocas.empty()) { 3700 Worklist.remove_if(IsAllocaInSet(DeletedAllocas)); 3701 PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas)); 3702 PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(), 3703 PromotableAllocas.end(), 3704 IsAllocaInSet(DeletedAllocas)), 3705 PromotableAllocas.end()); 3706 DeletedAllocas.clear(); 3707 } 3708 } 3709 3710 Changed |= promoteAllocas(F); 3711 3712 Worklist = PostPromotionWorklist; 3713 PostPromotionWorklist.clear(); 3714 } while (!Worklist.empty()); 3715 3716 return Changed; 3717} 3718 3719void SROA::getAnalysisUsage(AnalysisUsage &AU) const { 3720 if (RequiresDomTree) 3721 AU.addRequired<DominatorTree>(); 3722 AU.setPreservesCFG(); 3723} 3724