SROA.cpp revision 2c39b15073db81d93bb629303915b7d7e5d088dc
//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This transformation implements the well known scalar replacement of
/// aggregates transformation. It tries to identify promotable elements of an
/// aggregate alloca, and promote them to registers. It will also try to
/// convert uses of an element (or set of elements) of an alloca into a vector
/// or bitfield-style integer scalar if appropriate.
///
/// It works to do this with minimal slicing of the alloca so that regions
/// which are merely transferred in and out of external memory remain unchanged
/// and are not decomposed to scalar code.
///
/// Because this also performs alloca promotion, it can be thought of as also
/// serving the purpose of SSA formation. The algorithm iterates on the
/// function until all opportunities for promotion have been realized.
///
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "sroa"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Constants.h"
#include "llvm/DIBuilder.h"
#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Operator.h"
#include "llvm/Pass.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/InstVisitor.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;

STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
STATISTIC(NumDeleted, "Number of instructions deleted");
STATISTIC(NumVectorized, "Number of vectorized aggregates");

/// Hidden option to force the pass to not use DomTree and mem2reg, instead
/// forming SSA values through the SSAUpdater infrastructure.
static cl::opt<bool>
ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden);

namespace {
/// \brief Alloca partitioning representation.
///
/// This class represents a partitioning of an alloca into slices, and
/// information about the nature of uses of each slice of the alloca. The goal
/// is that this information is sufficient to decide if and how to split the
/// alloca apart and replace slices with scalars.
It is also intended that this 79/// structure can capture the relevant information needed both to decide about 80/// and to enact these transformations. 81class AllocaPartitioning { 82public: 83 /// \brief A common base class for representing a half-open byte range. 84 struct ByteRange { 85 /// \brief The beginning offset of the range. 86 uint64_t BeginOffset; 87 88 /// \brief The ending offset, not included in the range. 89 uint64_t EndOffset; 90 91 ByteRange() : BeginOffset(), EndOffset() {} 92 ByteRange(uint64_t BeginOffset, uint64_t EndOffset) 93 : BeginOffset(BeginOffset), EndOffset(EndOffset) {} 94 95 /// \brief Support for ordering ranges. 96 /// 97 /// This provides an ordering over ranges such that start offsets are 98 /// always increasing, and within equal start offsets, the end offsets are 99 /// decreasing. Thus the spanning range comes first in a cluster with the 100 /// same start position. 101 bool operator<(const ByteRange &RHS) const { 102 if (BeginOffset < RHS.BeginOffset) return true; 103 if (BeginOffset > RHS.BeginOffset) return false; 104 if (EndOffset > RHS.EndOffset) return true; 105 return false; 106 } 107 108 /// \brief Support comparison with a single offset to allow binary searches. 109 friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) { 110 return LHS.BeginOffset < RHSOffset; 111 } 112 113 friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, 114 const ByteRange &RHS) { 115 return LHSOffset < RHS.BeginOffset; 116 } 117 118 bool operator==(const ByteRange &RHS) const { 119 return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset; 120 } 121 bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); } 122 }; 123 124 /// \brief A partition of an alloca. 125 /// 126 /// This structure represents a contiguous partition of the alloca. These are 127 /// formed by examining the uses of the alloca. During formation, they may 128 /// overlap but once an AllocaPartitioning is built, the Partitions within it 129 /// are all disjoint. 130 struct Partition : public ByteRange { 131 /// \brief Whether this partition is splittable into smaller partitions. 132 /// 133 /// We flag partitions as splittable when they are formed entirely due to 134 /// accesses by trivially splittable operations such as memset and memcpy. 135 /// 136 /// FIXME: At some point we should consider loads and stores of FCAs to be 137 /// splittable and eagerly split them into scalar values. 138 bool IsSplittable; 139 140 /// \brief Test whether a partition has been marked as dead. 141 bool isDead() const { 142 if (BeginOffset == UINT64_MAX) { 143 assert(EndOffset == UINT64_MAX); 144 return true; 145 } 146 return false; 147 } 148 149 /// \brief Kill a partition. 150 /// This is accomplished by setting both its beginning and end offset to 151 /// the maximum possible value. 152 void kill() { 153 assert(!isDead() && "He's Dead, Jim!"); 154 BeginOffset = EndOffset = UINT64_MAX; 155 } 156 157 Partition() : ByteRange(), IsSplittable() {} 158 Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) 159 : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} 160 }; 161 162 /// \brief A particular use of a partition of the alloca. 163 /// 164 /// This structure is used to associate uses of a partition with it. They 165 /// mark the range of bytes which are referenced by a particular instruction, 166 /// and includes a handle to the user itself and the pointer value in use. 
167 /// The bounds of these uses are determined by intersecting the bounds of the 168 /// memory use itself with a particular partition. As a consequence there is 169 /// intentionally overlap between various uses of the same partition. 170 struct PartitionUse : public ByteRange { 171 /// \brief The use in question. Provides access to both user and used value. 172 /// 173 /// Note that this may be null if the partition use is *dead*, that is, it 174 /// should be ignored. 175 Use *U; 176 177 PartitionUse() : ByteRange(), U() {} 178 PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U) 179 : ByteRange(BeginOffset, EndOffset), U(U) {} 180 }; 181 182 /// \brief Construct a partitioning of a particular alloca. 183 /// 184 /// Construction does most of the work for partitioning the alloca. This 185 /// performs the necessary walks of users and builds a partitioning from it. 186 AllocaPartitioning(const DataLayout &TD, AllocaInst &AI); 187 188 /// \brief Test whether a pointer to the allocation escapes our analysis. 189 /// 190 /// If this is true, the partitioning is never fully built and should be 191 /// ignored. 192 bool isEscaped() const { return PointerEscapingInstr; } 193 194 /// \brief Support for iterating over the partitions. 195 /// @{ 196 typedef SmallVectorImpl<Partition>::iterator iterator; 197 iterator begin() { return Partitions.begin(); } 198 iterator end() { return Partitions.end(); } 199 200 typedef SmallVectorImpl<Partition>::const_iterator const_iterator; 201 const_iterator begin() const { return Partitions.begin(); } 202 const_iterator end() const { return Partitions.end(); } 203 /// @} 204 205 /// \brief Support for iterating over and manipulating a particular 206 /// partition's uses. 207 /// 208 /// The iteration support provided for uses is more limited, but also 209 /// includes some manipulation routines to support rewriting the uses of 210 /// partitions during SROA. 211 /// @{ 212 typedef SmallVectorImpl<PartitionUse>::iterator use_iterator; 213 use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); } 214 use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); } 215 use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); } 216 use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); } 217 218 typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator; 219 const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); } 220 const_use_iterator use_begin(const_iterator I) const { 221 return Uses[I - begin()].begin(); 222 } 223 const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); } 224 const_use_iterator use_end(const_iterator I) const { 225 return Uses[I - begin()].end(); 226 } 227 228 unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); } 229 unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); } 230 const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const { 231 return Uses[PIdx][UIdx]; 232 } 233 const PartitionUse &getUse(const_iterator I, unsigned UIdx) const { 234 return Uses[I - begin()][UIdx]; 235 } 236 237 void use_push_back(unsigned Idx, const PartitionUse &PU) { 238 Uses[Idx].push_back(PU); 239 } 240 void use_push_back(const_iterator I, const PartitionUse &PU) { 241 Uses[I - begin()].push_back(PU); 242 } 243 /// @} 244 245 /// \brief Allow iterating the dead users for this alloca. 246 /// 247 /// These are instructions which will never actually use the alloca as they 248 /// are outside the allocated range. 
They are safe to replace with undef and 249 /// delete. 250 /// @{ 251 typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator; 252 dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); } 253 dead_user_iterator dead_user_end() const { return DeadUsers.end(); } 254 /// @} 255 256 /// \brief Allow iterating the dead expressions referring to this alloca. 257 /// 258 /// These are operands which have cannot actually be used to refer to the 259 /// alloca as they are outside its range and the user doesn't correct for 260 /// that. These mostly consist of PHI node inputs and the like which we just 261 /// need to replace with undef. 262 /// @{ 263 typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator; 264 dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); } 265 dead_op_iterator dead_op_end() const { return DeadOperands.end(); } 266 /// @} 267 268 /// \brief MemTransferInst auxiliary data. 269 /// This struct provides some auxiliary data about memory transfer 270 /// intrinsics such as memcpy and memmove. These intrinsics can use two 271 /// different ranges within the same alloca, and provide other challenges to 272 /// correctly represent. We stash extra data to help us untangle this 273 /// after the partitioning is complete. 274 struct MemTransferOffsets { 275 /// The destination begin and end offsets when the destination is within 276 /// this alloca. If the end offset is zero the destination is not within 277 /// this alloca. 278 uint64_t DestBegin, DestEnd; 279 280 /// The source begin and end offsets when the source is within this alloca. 281 /// If the end offset is zero, the source is not within this alloca. 282 uint64_t SourceBegin, SourceEnd; 283 284 /// Flag for whether an alloca is splittable. 285 bool IsSplittable; 286 }; 287 MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const { 288 return MemTransferInstData.lookup(&II); 289 } 290 291 /// \brief Map from a PHI or select operand back to a partition. 292 /// 293 /// When manipulating PHI nodes or selects, they can use more than one 294 /// partition of an alloca. We store a special mapping to allow finding the 295 /// partition referenced by each of these operands, if any. 296 iterator findPartitionForPHIOrSelectOperand(Use *U) { 297 SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt 298 = PHIOrSelectOpMap.find(U); 299 if (MapIt == PHIOrSelectOpMap.end()) 300 return end(); 301 302 return begin() + MapIt->second.first; 303 } 304 305 /// \brief Map from a PHI or select operand back to the specific use of 306 /// a partition. 307 /// 308 /// Similar to mapping these operands back to the partitions, this maps 309 /// directly to the use structure of that partition. 310 use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) { 311 SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt 312 = PHIOrSelectOpMap.find(U); 313 assert(MapIt != PHIOrSelectOpMap.end()); 314 return Uses[MapIt->second.first].begin() + MapIt->second.second; 315 } 316 317 /// \brief Compute a common type among the uses of a particular partition. 318 /// 319 /// This routines walks all of the uses of a particular partition and tries 320 /// to find a common type between them. Untyped operations such as memset and 321 /// memcpy are ignored. 
322 Type *getCommonType(iterator I) const; 323 324#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 325 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; 326 void printUsers(raw_ostream &OS, const_iterator I, 327 StringRef Indent = " ") const; 328 void print(raw_ostream &OS) const; 329 void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const; 330 void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const; 331#endif 332 333private: 334 template <typename DerivedT, typename RetT = void> class BuilderBase; 335 class PartitionBuilder; 336 friend class AllocaPartitioning::PartitionBuilder; 337 class UseBuilder; 338 friend class AllocaPartitioning::UseBuilder; 339 340#ifndef NDEBUG 341 /// \brief Handle to alloca instruction to simplify method interfaces. 342 AllocaInst &AI; 343#endif 344 345 /// \brief The instruction responsible for this alloca having no partitioning. 346 /// 347 /// When an instruction (potentially) escapes the pointer to the alloca, we 348 /// store a pointer to that here and abort trying to partition the alloca. 349 /// This will be null if the alloca is partitioned successfully. 350 Instruction *PointerEscapingInstr; 351 352 /// \brief The partitions of the alloca. 353 /// 354 /// We store a vector of the partitions over the alloca here. This vector is 355 /// sorted by increasing begin offset, and then by decreasing end offset. See 356 /// the Partition inner class for more details. Initially (during 357 /// construction) there are overlaps, but we form a disjoint sequence of 358 /// partitions while finishing construction and a fully constructed object is 359 /// expected to always have this as a disjoint space. 360 SmallVector<Partition, 8> Partitions; 361 362 /// \brief The uses of the partitions. 363 /// 364 /// This is essentially a mapping from each partition to a list of uses of 365 /// that partition. The mapping is done with a Uses vector that has the exact 366 /// same number of entries as the partition vector. Each entry is itself 367 /// a vector of the uses. 368 SmallVector<SmallVector<PartitionUse, 2>, 8> Uses; 369 370 /// \brief Instructions which will become dead if we rewrite the alloca. 371 /// 372 /// Note that these are not separated by partition. This is because we expect 373 /// a partitioned alloca to be completely rewritten or not rewritten at all. 374 /// If rewritten, all these instructions can simply be removed and replaced 375 /// with undef as they come from outside of the allocated space. 376 SmallVector<Instruction *, 8> DeadUsers; 377 378 /// \brief Operands which will become dead if we rewrite the alloca. 379 /// 380 /// These are operands that in their particular use can be replaced with 381 /// undef when we rewrite the alloca. These show up in out-of-bounds inputs 382 /// to PHI nodes and the like. They aren't entirely dead (there might be 383 /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we 384 /// want to swap this particular input for undef to simplify the use lists of 385 /// the alloca. 386 SmallVector<Use *, 8> DeadOperands; 387 388 /// \brief The underlying storage for auxiliary memcpy and memset info. 389 SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData; 390 391 /// \brief A side datastructure used when building up the partitions and uses. 392 /// 393 /// This mapping is only really used during the initial building of the 394 /// partitioning so that we can retain information about PHI and select nodes 395 /// processed. 
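/// The mapped value pairs the computed access size for the node (first) with
/// a flag (second) that is set when the same PHI or select is reached again
/// during the walk; see visitPHINode and visitSelectInst below.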
396 SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes; 397 398 /// \brief Auxiliary information for particular PHI or select operands. 399 SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap; 400 401 /// \brief A utility routine called from the constructor. 402 /// 403 /// This does what it says on the tin. It is the key of the alloca partition 404 /// splitting and merging. After it is called we have the desired disjoint 405 /// collection of partitions. 406 void splitAndMergePartitions(); 407}; 408} 409 410template <typename DerivedT, typename RetT> 411class AllocaPartitioning::BuilderBase 412 : public InstVisitor<DerivedT, RetT> { 413public: 414 BuilderBase(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) 415 : TD(TD), 416 AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())), 417 P(P) { 418 enqueueUsers(AI, 0); 419 } 420 421protected: 422 const DataLayout &TD; 423 const uint64_t AllocSize; 424 AllocaPartitioning &P; 425 426 SmallPtrSet<Use *, 8> VisitedUses; 427 428 struct OffsetUse { 429 Use *U; 430 int64_t Offset; 431 }; 432 SmallVector<OffsetUse, 8> Queue; 433 434 // The active offset and use while visiting. 435 Use *U; 436 int64_t Offset; 437 438 void enqueueUsers(Instruction &I, int64_t UserOffset) { 439 for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); 440 UI != UE; ++UI) { 441 if (VisitedUses.insert(&UI.getUse())) { 442 OffsetUse OU = { &UI.getUse(), UserOffset }; 443 Queue.push_back(OU); 444 } 445 } 446 } 447 448 bool computeConstantGEPOffset(GetElementPtrInst &GEPI, int64_t &GEPOffset) { 449 GEPOffset = Offset; 450 unsigned int AS = GEPI.getPointerAddressSpace(); 451 for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI); 452 GTI != GTE; ++GTI) { 453 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); 454 if (!OpC) 455 return false; 456 if (OpC->isZero()) 457 continue; 458 459 // Handle a struct index, which adds its field offset to the pointer. 460 if (StructType *STy = dyn_cast<StructType>(*GTI)) { 461 unsigned ElementIdx = OpC->getZExtValue(); 462 const StructLayout *SL = TD.getStructLayout(STy); 463 uint64_t ElementOffset = SL->getElementOffset(ElementIdx); 464 // Check that we can continue to model this GEP in a signed 64-bit offset. 465 if (ElementOffset > INT64_MAX || 466 (GEPOffset >= 0 && 467 ((uint64_t)GEPOffset + ElementOffset) > INT64_MAX)) { 468 DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding " 469 << "what can be represented in an int64_t!\n" 470 << " alloca: " << P.AI << "\n"); 471 return false; 472 } 473 if (GEPOffset < 0) 474 GEPOffset = ElementOffset + (uint64_t)-GEPOffset; 475 else 476 GEPOffset += ElementOffset; 477 continue; 478 } 479 480 APInt Index = OpC->getValue().sextOrTrunc(TD.getPointerSizeInBits(AS)); 481 Index *= APInt(Index.getBitWidth(), 482 TD.getTypeAllocSize(GTI.getIndexedType())); 483 Index += APInt(Index.getBitWidth(), (uint64_t)GEPOffset, 484 /*isSigned*/true); 485 // Check if the result can be stored in our int64_t offset. 486 if (!Index.isSignedIntN(sizeof(GEPOffset) * 8)) { 487 DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding " 488 << "what can be represented in an int64_t!\n" 489 << " alloca: " << P.AI << "\n"); 490 return false; 491 } 492 493 GEPOffset = Index.getSExtValue(); 494 } 495 return true; 496 } 497 498 Value *foldSelectInst(SelectInst &SI) { 499 // If the condition being selected on is a constant or the same value is 500 // being selected between, fold the select. 
Yes this does (rarely) happen 501 // early on. 502 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) 503 return SI.getOperand(1+CI->isZero()); 504 if (SI.getOperand(1) == SI.getOperand(2)) { 505 assert(*U == SI.getOperand(1)); 506 return SI.getOperand(1); 507 } 508 return 0; 509 } 510}; 511 512/// \brief Builder for the alloca partitioning. 513/// 514/// This class builds an alloca partitioning by recursively visiting the uses 515/// of an alloca and splitting the partitions for each load and store at each 516/// offset. 517class AllocaPartitioning::PartitionBuilder 518 : public BuilderBase<PartitionBuilder, bool> { 519 friend class InstVisitor<PartitionBuilder, bool>; 520 521 SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap; 522 523public: 524 PartitionBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) 525 : BuilderBase<PartitionBuilder, bool>(TD, AI, P) {} 526 527 /// \brief Run the builder over the allocation. 528 bool operator()() { 529 // Note that we have to re-evaluate size on each trip through the loop as 530 // the queue grows at the tail. 531 for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) { 532 U = Queue[Idx].U; 533 Offset = Queue[Idx].Offset; 534 if (!visit(cast<Instruction>(U->getUser()))) 535 return false; 536 } 537 return true; 538 } 539 540private: 541 bool markAsEscaping(Instruction &I) { 542 P.PointerEscapingInstr = &I; 543 return false; 544 } 545 546 void insertUse(Instruction &I, int64_t Offset, uint64_t Size, 547 bool IsSplittable = false) { 548 // Completely skip uses which have a zero size or don't overlap the 549 // allocation. 550 if (Size == 0 || 551 (Offset >= 0 && (uint64_t)Offset >= AllocSize) || 552 (Offset < 0 && (uint64_t)-Offset >= Size)) { 553 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset 554 << " which starts past the end of the " << AllocSize 555 << " byte alloca:\n" 556 << " alloca: " << P.AI << "\n" 557 << " use: " << I << "\n"); 558 return; 559 } 560 561 // Clamp the start to the beginning of the allocation. 562 if (Offset < 0) { 563 DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset 564 << " to start at the beginning of the alloca:\n" 565 << " alloca: " << P.AI << "\n" 566 << " use: " << I << "\n"); 567 Size -= (uint64_t)-Offset; 568 Offset = 0; 569 } 570 571 uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size; 572 573 // Clamp the end offset to the end of the allocation. Note that this is 574 // formulated to handle even the case where "BeginOffset + Size" overflows. 575 assert(AllocSize >= BeginOffset); // Established above. 576 if (Size > AllocSize - BeginOffset) { 577 DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset 578 << " to remain within the " << AllocSize << " byte alloca:\n" 579 << " alloca: " << P.AI << "\n" 580 << " use: " << I << "\n"); 581 EndOffset = AllocSize; 582 } 583 584 Partition New(BeginOffset, EndOffset, IsSplittable); 585 P.Partitions.push_back(New); 586 } 587 588 bool handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) { 589 uint64_t Size = TD.getTypeStoreSize(Ty); 590 591 // If this memory access can be shown to *statically* extend outside the 592 // bounds of of the allocation, it's behavior is undefined, so simply 593 // ignore it. Note that this is more strict than the generic clamping 594 // behavior of insertUse. We also try to handle cases which might run the 595 // risk of overflow. 
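    // (Rejecting negative offsets outright and comparing Size against
    // "AllocSize - Offset" means the check below never forms "Offset + Size",
    // which could otherwise overflow.)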
596 // FIXME: We should instead consider the pointer to have escaped if this 597 // function is being instrumented for addressing bugs or race conditions. 598 if (Offset < 0 || (uint64_t)Offset >= AllocSize || 599 Size > (AllocSize - (uint64_t)Offset)) { 600 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte " 601 << (isa<LoadInst>(I) ? "load" : "store") << " @" << Offset 602 << " which extends past the end of the " << AllocSize 603 << " byte alloca:\n" 604 << " alloca: " << P.AI << "\n" 605 << " use: " << I << "\n"); 606 return true; 607 } 608 609 insertUse(I, Offset, Size); 610 return true; 611 } 612 613 bool visitBitCastInst(BitCastInst &BC) { 614 enqueueUsers(BC, Offset); 615 return true; 616 } 617 618 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { 619 int64_t GEPOffset; 620 if (!computeConstantGEPOffset(GEPI, GEPOffset)) 621 return markAsEscaping(GEPI); 622 623 enqueueUsers(GEPI, GEPOffset); 624 return true; 625 } 626 627 bool visitLoadInst(LoadInst &LI) { 628 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) && 629 "All simple FCA loads should have been pre-split"); 630 return handleLoadOrStore(LI.getType(), LI, Offset); 631 } 632 633 bool visitStoreInst(StoreInst &SI) { 634 Value *ValOp = SI.getValueOperand(); 635 if (ValOp == *U) 636 return markAsEscaping(SI); 637 638 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && 639 "All simple FCA stores should have been pre-split"); 640 return handleLoadOrStore(ValOp->getType(), SI, Offset); 641 } 642 643 644 bool visitMemSetInst(MemSetInst &II) { 645 assert(II.getRawDest() == *U && "Pointer use is not the destination?"); 646 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 647 uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset; 648 insertUse(II, Offset, Size, Length); 649 return true; 650 } 651 652 bool visitMemTransferInst(MemTransferInst &II) { 653 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 654 uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset; 655 if (!Size) 656 // Zero-length mem transfer intrinsics can be ignored entirely. 657 return true; 658 659 MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; 660 661 // Only intrinsics with a constant length can be split. 662 Offsets.IsSplittable = Length; 663 664 if (*U == II.getRawDest()) { 665 Offsets.DestBegin = Offset; 666 Offsets.DestEnd = Offset + Size; 667 } 668 if (*U == II.getRawSource()) { 669 Offsets.SourceBegin = Offset; 670 Offsets.SourceEnd = Offset + Size; 671 } 672 673 // If we have set up end offsets for both the source and the destination, 674 // we have found both sides of this transfer pointing at the same alloca. 675 bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd; 676 if (SeenBothEnds && II.getRawDest() != II.getRawSource()) { 677 unsigned PrevIdx = MemTransferPartitionMap[&II]; 678 679 // Check if the begin offsets match and this is a non-volatile transfer. 680 // In that case, we can completely elide the transfer. 681 if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) { 682 P.Partitions[PrevIdx].kill(); 683 return true; 684 } 685 686 // Otherwise we have an offset transfer within the same alloca. We can't 687 // split those. 688 P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false; 689 } else if (SeenBothEnds) { 690 // Handle the case where this exact use provides both ends of the 691 // operation. 692 assert(II.getRawDest() == II.getRawSource()); 693 694 // For non-volatile transfers this is a no-op. 
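      // (A transfer whose source and destination are the same bytes of this
      // alloca copies data onto itself, so unless it is volatile we can drop
      // it from the partitioning entirely.)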
      if (!II.isVolatile())
        return true;

      // Otherwise just suppress splitting.
      Offsets.IsSplittable = false;
    }

    // Insert the use now that we've fixed up the splittable nature.
    insertUse(II, Offset, Size, Offsets.IsSplittable);

    // Set up the mapping from intrinsic to partition if we've not seen both
    // ends of this transfer.
    if (!SeenBothEnds) {
      unsigned NewIdx = P.Partitions.size() - 1;
      bool Inserted
        = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second;
      assert(Inserted &&
             "Already have intrinsic in map but haven't seen both ends");
      (void)Inserted;
    }

    return true;
  }

  // Disable SROA for any intrinsics except for lifetime invariants.
  // FIXME: What about debug intrinsics? This matches old behavior, but
  // doesn't make sense.
  bool visitIntrinsicInst(IntrinsicInst &II) {
    if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
        II.getIntrinsicID() == Intrinsic::lifetime_end) {
      ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
      uint64_t Size = std::min(AllocSize - Offset, Length->getLimitedValue());
      insertUse(II, Offset, Size, true);
      return true;
    }

    return markAsEscaping(II);
  }

  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
    // We consider any PHI or select that results in a direct load or store of
    // the same offset to be a viable use for partitioning purposes. These uses
    // are considered unsplittable and the size is the maximum loaded or stored
    // size.
    SmallPtrSet<Instruction *, 4> Visited;
    SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
    Visited.insert(Root);
    Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
    // If there are no loads or stores, the access is dead. We mark that as
    // a size zero access.
    Size = 0;
    do {
      Instruction *I, *UsedI;
      llvm::tie(UsedI, I) = Uses.pop_back_val();

      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
        Size = std::max(Size, TD.getTypeStoreSize(LI->getType()));
        continue;
      }
      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
        Value *Op = SI->getOperand(0);
        if (Op == UsedI)
          return SI;
        Size = std::max(Size, TD.getTypeStoreSize(Op->getType()));
        continue;
      }

      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
        if (!GEP->hasAllZeroIndices())
          return GEP;
      } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
                 !isa<SelectInst>(I)) {
        return I;
      }

      for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
           ++UI)
        if (Visited.insert(cast<Instruction>(*UI)))
          Uses.push_back(std::make_pair(I, cast<Instruction>(*UI)));
    } while (!Uses.empty());

    return 0;
  }

  bool visitPHINode(PHINode &PN) {
    // See if we already have computed info on this node.
    std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN];
    if (PHIInfo.first) {
      PHIInfo.second = true;
      insertUse(PN, Offset, PHIInfo.first);
      return true;
    }

    // Check for an unsafe use of the PHI node.
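    // (hasUnsafePHIOrSelectUse also computes the maximum loaded or stored
    // size into PHIInfo.first, which becomes the size of the use recorded
    // below.)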
    if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first))
      return markAsEscaping(*EscapingI);

    insertUse(PN, Offset, PHIInfo.first);
    return true;
  }

  bool visitSelectInst(SelectInst &SI) {
    if (Value *Result = foldSelectInst(SI)) {
      if (Result == *U)
        // If the result of the constant fold will be the pointer, recurse
        // through the select as if we had RAUW'ed it.
        enqueueUsers(SI, Offset);

      return true;
    }

    // See if we already have computed info on this node.
    std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI];
    if (SelectInfo.first) {
      SelectInfo.second = true;
      insertUse(SI, Offset, SelectInfo.first);
      return true;
    }

    // Check for an unsafe use of the select.
    if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first))
      return markAsEscaping(*EscapingI);

    insertUse(SI, Offset, SelectInfo.first);
    return true;
  }

  /// \brief Disable SROA entirely if there are unhandled users of the alloca.
  bool visitInstruction(Instruction &I) { return markAsEscaping(I); }
};

/// \brief Use adder for the alloca partitioning.
///
/// This class adds the uses of an alloca to all of the partitions which they
/// use. For splittable partitions, this can end up doing essentially a linear
/// walk of the partitions, but the number of steps remains bounded by the
/// total result instruction size:
/// - The number of partitions is a result of the number of unsplittable
///   instructions using the alloca.
/// - The number of users of each partition is at worst the total number of
///   splittable instructions using the alloca.
/// Thus we will produce N * M instructions in the end, where N is the number
/// of unsplittable uses and M is the number of splittable ones. This visitor
/// does the exact same number of updates to the partitioning.
///
/// In the more common case, this visitor will leverage the fact that the
/// partition space is pre-sorted, and do a logarithmic search for the
/// partition needed, making the total visit a classical ((N + M) * log(N))
/// complexity operation.
class AllocaPartitioning::UseBuilder : public BuilderBase<UseBuilder> {
  friend class InstVisitor<UseBuilder>;

  /// \brief Set to de-duplicate dead instructions found in the use walk.
  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;

public:
  UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
    : BuilderBase<UseBuilder>(TD, AI, P) {}

  /// \brief Run the builder over the allocation.
  void operator()() {
    // Note that we have to re-evaluate size on each trip through the loop as
    // the queue grows at the tail.
    for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
      U = Queue[Idx].U;
      Offset = Queue[Idx].Offset;
      this->visit(cast<Instruction>(U->getUser()));
    }
  }

private:
  void markAsDead(Instruction &I) {
    if (VisitedDeadInsts.insert(&I))
      P.DeadUsers.push_back(&I);
  }

  void insertUse(Instruction &User, int64_t Offset, uint64_t Size) {
    // If the use has a zero size or extends outside of the allocation, record
    // it as a dead use for elimination later.
    if (Size == 0 || (uint64_t)Offset >= AllocSize ||
        (Offset < 0 && (uint64_t)-Offset >= Size))
      return markAsDead(User);

    // Clamp the start to the beginning of the allocation.
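    // (A negative Offset means the access begins before the alloca; only the
    // tail of the access that overlaps the allocation is kept.)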
881 if (Offset < 0) { 882 Size -= (uint64_t)-Offset; 883 Offset = 0; 884 } 885 886 uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size; 887 888 // Clamp the end offset to the end of the allocation. Note that this is 889 // formulated to handle even the case where "BeginOffset + Size" overflows. 890 assert(AllocSize >= BeginOffset); // Established above. 891 if (Size > AllocSize - BeginOffset) 892 EndOffset = AllocSize; 893 894 // NB: This only works if we have zero overlapping partitions. 895 iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset); 896 if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset) 897 B = llvm::prior(B); 898 for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset; 899 ++I) { 900 PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset), 901 std::min(I->EndOffset, EndOffset), U); 902 P.use_push_back(I, NewPU); 903 if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser())) 904 P.PHIOrSelectOpMap[U] 905 = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1); 906 } 907 } 908 909 void handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) { 910 uint64_t Size = TD.getTypeStoreSize(Ty); 911 912 // If this memory access can be shown to *statically* extend outside the 913 // bounds of of the allocation, it's behavior is undefined, so simply 914 // ignore it. Note that this is more strict than the generic clamping 915 // behavior of insertUse. 916 if (Offset < 0 || (uint64_t)Offset >= AllocSize || 917 Size > (AllocSize - (uint64_t)Offset)) 918 return markAsDead(I); 919 920 insertUse(I, Offset, Size); 921 } 922 923 void visitBitCastInst(BitCastInst &BC) { 924 if (BC.use_empty()) 925 return markAsDead(BC); 926 927 enqueueUsers(BC, Offset); 928 } 929 930 void visitGetElementPtrInst(GetElementPtrInst &GEPI) { 931 if (GEPI.use_empty()) 932 return markAsDead(GEPI); 933 934 int64_t GEPOffset; 935 if (!computeConstantGEPOffset(GEPI, GEPOffset)) 936 llvm_unreachable("Unable to compute constant offset for use"); 937 938 enqueueUsers(GEPI, GEPOffset); 939 } 940 941 void visitLoadInst(LoadInst &LI) { 942 handleLoadOrStore(LI.getType(), LI, Offset); 943 } 944 945 void visitStoreInst(StoreInst &SI) { 946 handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset); 947 } 948 949 void visitMemSetInst(MemSetInst &II) { 950 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 951 uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset; 952 insertUse(II, Offset, Size); 953 } 954 955 void visitMemTransferInst(MemTransferInst &II) { 956 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 957 uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset; 958 if (!Size) 959 return markAsDead(II); 960 961 MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; 962 if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd && 963 Offsets.DestBegin == Offsets.SourceBegin) 964 return markAsDead(II); // Skip identity transfers without side-effects. 
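    // Otherwise record the transfer against every partition it overlaps;
    // insertUse clamps the byte range to the allocation just as the
    // partition builder did.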
965 966 insertUse(II, Offset, Size); 967 } 968 969 void visitIntrinsicInst(IntrinsicInst &II) { 970 assert(II.getIntrinsicID() == Intrinsic::lifetime_start || 971 II.getIntrinsicID() == Intrinsic::lifetime_end); 972 973 ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); 974 insertUse(II, Offset, 975 std::min(AllocSize - Offset, Length->getLimitedValue())); 976 } 977 978 void insertPHIOrSelect(Instruction &User, uint64_t Offset) { 979 uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first; 980 981 // For PHI and select operands outside the alloca, we can't nuke the entire 982 // phi or select -- the other side might still be relevant, so we special 983 // case them here and use a separate structure to track the operands 984 // themselves which should be replaced with undef. 985 if (Offset >= AllocSize) { 986 P.DeadOperands.push_back(U); 987 return; 988 } 989 990 insertUse(User, Offset, Size); 991 } 992 void visitPHINode(PHINode &PN) { 993 if (PN.use_empty()) 994 return markAsDead(PN); 995 996 insertPHIOrSelect(PN, Offset); 997 } 998 void visitSelectInst(SelectInst &SI) { 999 if (SI.use_empty()) 1000 return markAsDead(SI); 1001 1002 if (Value *Result = foldSelectInst(SI)) { 1003 if (Result == *U) 1004 // If the result of the constant fold will be the pointer, recurse 1005 // through the select as if we had RAUW'ed it. 1006 enqueueUsers(SI, Offset); 1007 else 1008 // Otherwise the operand to the select is dead, and we can replace it 1009 // with undef. 1010 P.DeadOperands.push_back(U); 1011 1012 return; 1013 } 1014 1015 insertPHIOrSelect(SI, Offset); 1016 } 1017 1018 /// \brief Unreachable, we've already visited the alloca once. 1019 void visitInstruction(Instruction &I) { 1020 llvm_unreachable("Unhandled instruction in use builder."); 1021 } 1022}; 1023 1024void AllocaPartitioning::splitAndMergePartitions() { 1025 size_t NumDeadPartitions = 0; 1026 1027 // Track the range of splittable partitions that we pass when accumulating 1028 // overlapping unsplittable partitions. 1029 uint64_t SplitEndOffset = 0ull; 1030 1031 Partition New(0ull, 0ull, false); 1032 1033 for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) { 1034 ++j; 1035 1036 if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) { 1037 assert(New.BeginOffset == New.EndOffset); 1038 New = Partitions[i]; 1039 } else { 1040 assert(New.IsSplittable); 1041 New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset); 1042 } 1043 assert(New.BeginOffset != New.EndOffset); 1044 1045 // Scan the overlapping partitions. 1046 while (j != e && New.EndOffset > Partitions[j].BeginOffset) { 1047 // If the new partition we are forming is splittable, stop at the first 1048 // unsplittable partition. 1049 if (New.IsSplittable && !Partitions[j].IsSplittable) 1050 break; 1051 1052 // Grow the new partition to include any equally splittable range. 'j' is 1053 // always equally splittable when New is splittable, but when New is not 1054 // splittable, we may subsume some (or part of some) splitable partition 1055 // without growing the new one. 
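    // (When splittability differs, the splittable partition is only folded
    // into SplitEndOffset here; any tail it leaves uncovered is re-emitted
    // as a fresh splittable partition once the unsplittable run ends.)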
1056 if (New.IsSplittable == Partitions[j].IsSplittable) { 1057 New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset); 1058 } else { 1059 assert(!New.IsSplittable); 1060 assert(Partitions[j].IsSplittable); 1061 SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset); 1062 } 1063 1064 Partitions[j].kill(); 1065 ++NumDeadPartitions; 1066 ++j; 1067 } 1068 1069 // If the new partition is splittable, chop off the end as soon as the 1070 // unsplittable subsequent partition starts and ensure we eventually cover 1071 // the splittable area. 1072 if (j != e && New.IsSplittable) { 1073 SplitEndOffset = std::max(SplitEndOffset, New.EndOffset); 1074 New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); 1075 } 1076 1077 // Add the new partition if it differs from the original one and is 1078 // non-empty. We can end up with an empty partition here if it was 1079 // splittable but there is an unsplittable one that starts at the same 1080 // offset. 1081 if (New != Partitions[i]) { 1082 if (New.BeginOffset != New.EndOffset) 1083 Partitions.push_back(New); 1084 // Mark the old one for removal. 1085 Partitions[i].kill(); 1086 ++NumDeadPartitions; 1087 } 1088 1089 New.BeginOffset = New.EndOffset; 1090 if (!New.IsSplittable) { 1091 New.EndOffset = std::max(New.EndOffset, SplitEndOffset); 1092 if (j != e && !Partitions[j].IsSplittable) 1093 New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); 1094 New.IsSplittable = true; 1095 // If there is a trailing splittable partition which won't be fused into 1096 // the next splittable partition go ahead and add it onto the partitions 1097 // list. 1098 if (New.BeginOffset < New.EndOffset && 1099 (j == e || !Partitions[j].IsSplittable || 1100 New.EndOffset < Partitions[j].BeginOffset)) { 1101 Partitions.push_back(New); 1102 New.BeginOffset = New.EndOffset = 0ull; 1103 } 1104 } 1105 } 1106 1107 // Re-sort the partitions now that they have been split and merged into 1108 // disjoint set of partitions. Also remove any of the dead partitions we've 1109 // replaced in the process. 1110 std::sort(Partitions.begin(), Partitions.end()); 1111 if (NumDeadPartitions) { 1112 assert(Partitions.back().isDead()); 1113 assert((ptrdiff_t)NumDeadPartitions == 1114 std::count(Partitions.begin(), Partitions.end(), Partitions.back())); 1115 } 1116 Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end()); 1117} 1118 1119AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI) 1120 : 1121#ifndef NDEBUG 1122 AI(AI), 1123#endif 1124 PointerEscapingInstr(0) { 1125 PartitionBuilder PB(TD, AI, *this); 1126 if (!PB()) 1127 return; 1128 1129 // Sort the uses. This arranges for the offsets to be in ascending order, 1130 // and the sizes to be in descending order. 1131 std::sort(Partitions.begin(), Partitions.end()); 1132 1133 // Remove any partitions from the back which are marked as dead. 1134 while (!Partitions.empty() && Partitions.back().isDead()) 1135 Partitions.pop_back(); 1136 1137 if (Partitions.size() > 1) { 1138 // Intersect splittability for all partitions with equal offsets and sizes. 1139 // Then remove all but the first so that we have a sequence of non-equal but 1140 // potentially overlapping partitions. 
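    // For example, two identical splittable [0,8) partitions formed from
    // memcpy uses and one unsplittable [0,8) partition from an 8-byte load
    // collapse here into a single [0,8) partition marked unsplittable.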
1141 for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E; 1142 I = J) { 1143 ++J; 1144 while (J != E && *I == *J) { 1145 I->IsSplittable &= J->IsSplittable; 1146 ++J; 1147 } 1148 } 1149 Partitions.erase(std::unique(Partitions.begin(), Partitions.end()), 1150 Partitions.end()); 1151 1152 // Split splittable and merge unsplittable partitions into a disjoint set 1153 // of partitions over the used space of the allocation. 1154 splitAndMergePartitions(); 1155 } 1156 1157 // Now build up the user lists for each of these disjoint partitions by 1158 // re-walking the recursive users of the alloca. 1159 Uses.resize(Partitions.size()); 1160 UseBuilder UB(TD, AI, *this); 1161 UB(); 1162} 1163 1164Type *AllocaPartitioning::getCommonType(iterator I) const { 1165 Type *Ty = 0; 1166 for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { 1167 if (!UI->U) 1168 continue; // Skip dead uses. 1169 if (isa<IntrinsicInst>(*UI->U->getUser())) 1170 continue; 1171 if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset) 1172 continue; 1173 1174 Type *UserTy = 0; 1175 if (LoadInst *LI = dyn_cast<LoadInst>(UI->U->getUser())) { 1176 UserTy = LI->getType(); 1177 } else if (StoreInst *SI = dyn_cast<StoreInst>(UI->U->getUser())) { 1178 UserTy = SI->getValueOperand()->getType(); 1179 } 1180 1181 if (Ty && Ty != UserTy) 1182 return 0; 1183 1184 Ty = UserTy; 1185 } 1186 return Ty; 1187} 1188 1189#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1190 1191void AllocaPartitioning::print(raw_ostream &OS, const_iterator I, 1192 StringRef Indent) const { 1193 OS << Indent << "partition #" << (I - begin()) 1194 << " [" << I->BeginOffset << "," << I->EndOffset << ")" 1195 << (I->IsSplittable ? " (splittable)" : "") 1196 << (Uses[I - begin()].empty() ? " (zero uses)" : "") 1197 << "\n"; 1198} 1199 1200void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, 1201 StringRef Indent) const { 1202 for (const_use_iterator UI = use_begin(I), UE = use_end(I); 1203 UI != UE; ++UI) { 1204 if (!UI->U) 1205 continue; // Skip dead uses. 1206 OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " 1207 << "used by: " << *UI->U->getUser() << "\n"; 1208 if (MemTransferInst *II = dyn_cast<MemTransferInst>(UI->U->getUser())) { 1209 const MemTransferOffsets &MTO = MemTransferInstData.lookup(II); 1210 bool IsDest; 1211 if (!MTO.IsSplittable) 1212 IsDest = UI->BeginOffset == MTO.DestBegin; 1213 else 1214 IsDest = MTO.DestBegin != 0u; 1215 OS << Indent << " (original " << (IsDest ? "dest" : "source") << ": " 1216 << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin) 1217 << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n"; 1218 } 1219 } 1220} 1221 1222void AllocaPartitioning::print(raw_ostream &OS) const { 1223 if (PointerEscapingInstr) { 1224 OS << "No partitioning for alloca: " << AI << "\n" 1225 << " A pointer to this alloca escaped by:\n" 1226 << " " << *PointerEscapingInstr << "\n"; 1227 return; 1228 } 1229 1230 OS << "Partitioning of alloca: " << AI << "\n"; 1231 unsigned Num = 0; 1232 for (const_iterator I = begin(), E = end(); I != E; ++I, ++Num) { 1233 print(OS, I); 1234 printUsers(OS, I); 1235 } 1236} 1237 1238void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); } 1239void AllocaPartitioning::dump() const { print(dbgs()); } 1240 1241#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1242 1243 1244namespace { 1245/// \brief Implementation of LoadAndStorePromoter for promoting allocas. 
1246/// 1247/// This subclass of LoadAndStorePromoter adds overrides to handle promoting 1248/// the loads and stores of an alloca instruction, as well as updating its 1249/// debug information. This is used when a domtree is unavailable and thus 1250/// mem2reg in its full form can't be used to handle promotion of allocas to 1251/// scalar values. 1252class AllocaPromoter : public LoadAndStorePromoter { 1253 AllocaInst &AI; 1254 DIBuilder &DIB; 1255 1256 SmallVector<DbgDeclareInst *, 4> DDIs; 1257 SmallVector<DbgValueInst *, 4> DVIs; 1258 1259public: 1260 AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, 1261 AllocaInst &AI, DIBuilder &DIB) 1262 : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} 1263 1264 void run(const SmallVectorImpl<Instruction*> &Insts) { 1265 // Remember which alloca we're promoting (for isInstInList). 1266 if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { 1267 for (Value::use_iterator UI = DebugNode->use_begin(), 1268 UE = DebugNode->use_end(); 1269 UI != UE; ++UI) 1270 if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) 1271 DDIs.push_back(DDI); 1272 else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) 1273 DVIs.push_back(DVI); 1274 } 1275 1276 LoadAndStorePromoter::run(Insts); 1277 AI.eraseFromParent(); 1278 while (!DDIs.empty()) 1279 DDIs.pop_back_val()->eraseFromParent(); 1280 while (!DVIs.empty()) 1281 DVIs.pop_back_val()->eraseFromParent(); 1282 } 1283 1284 virtual bool isInstInList(Instruction *I, 1285 const SmallVectorImpl<Instruction*> &Insts) const { 1286 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 1287 return LI->getOperand(0) == &AI; 1288 return cast<StoreInst>(I)->getPointerOperand() == &AI; 1289 } 1290 1291 virtual void updateDebugInfo(Instruction *Inst) const { 1292 for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), 1293 E = DDIs.end(); I != E; ++I) { 1294 DbgDeclareInst *DDI = *I; 1295 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) 1296 ConvertDebugDeclareToDebugValue(DDI, SI, DIB); 1297 else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) 1298 ConvertDebugDeclareToDebugValue(DDI, LI, DIB); 1299 } 1300 for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), 1301 E = DVIs.end(); I != E; ++I) { 1302 DbgValueInst *DVI = *I; 1303 Value *Arg = NULL; 1304 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { 1305 // If an argument is zero extended then use argument directly. The ZExt 1306 // may be zapped by an optimization pass in future. 1307 if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) 1308 Arg = dyn_cast<Argument>(ZExt->getOperand(0)); 1309 if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) 1310 Arg = dyn_cast<Argument>(SExt->getOperand(0)); 1311 if (!Arg) 1312 Arg = SI->getOperand(0); 1313 } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { 1314 Arg = LI->getOperand(0); 1315 } else { 1316 continue; 1317 } 1318 Instruction *DbgVal = 1319 DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), 1320 Inst); 1321 DbgVal->setDebugLoc(DVI->getDebugLoc()); 1322 } 1323 } 1324}; 1325} // end anon namespace 1326 1327 1328namespace { 1329/// \brief An optimization pass providing Scalar Replacement of Aggregates. 1330/// 1331/// This pass takes allocations which can be completely analyzed (that is, they 1332/// don't escape) and tries to turn them into scalar SSA values. There are 1333/// a few steps to this process. 
1334/// 1335/// 1) It takes allocations of aggregates and analyzes the ways in which they 1336/// are used to try to split them into smaller allocations, ideally of 1337/// a single scalar data type. It will split up memcpy and memset accesses 1338/// as necessary and try to isolate invidual scalar accesses. 1339/// 2) It will transform accesses into forms which are suitable for SSA value 1340/// promotion. This can be replacing a memset with a scalar store of an 1341/// integer value, or it can involve speculating operations on a PHI or 1342/// select to be a PHI or select of the results. 1343/// 3) Finally, this will try to detect a pattern of accesses which map cleanly 1344/// onto insert and extract operations on a vector value, and convert them to 1345/// this form. By doing so, it will enable promotion of vector aggregates to 1346/// SSA vector values. 1347class SROA : public FunctionPass { 1348 const bool RequiresDomTree; 1349 1350 LLVMContext *C; 1351 const DataLayout *TD; 1352 DominatorTree *DT; 1353 1354 /// \brief Worklist of alloca instructions to simplify. 1355 /// 1356 /// Each alloca in the function is added to this. Each new alloca formed gets 1357 /// added to it as well to recursively simplify unless that alloca can be 1358 /// directly promoted. Finally, each time we rewrite a use of an alloca other 1359 /// the one being actively rewritten, we add it back onto the list if not 1360 /// already present to ensure it is re-visited. 1361 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist; 1362 1363 /// \brief A collection of instructions to delete. 1364 /// We try to batch deletions to simplify code and make things a bit more 1365 /// efficient. 1366 SmallVector<Instruction *, 8> DeadInsts; 1367 1368 /// \brief A set to prevent repeatedly marking an instruction split into many 1369 /// uses as dead. Only used to guard insertion into DeadInsts. 1370 SmallPtrSet<Instruction *, 4> DeadSplitInsts; 1371 1372 /// \brief Post-promotion worklist. 1373 /// 1374 /// Sometimes we discover an alloca which has a high probability of becoming 1375 /// viable for SROA after a round of promotion takes place. In those cases, 1376 /// the alloca is enqueued here for re-processing. 1377 /// 1378 /// Note that we have to be very careful to clear allocas out of this list in 1379 /// the event they are deleted. 1380 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist; 1381 1382 /// \brief A collection of alloca instructions we can directly promote. 
1383 std::vector<AllocaInst *> PromotableAllocas; 1384 1385public: 1386 SROA(bool RequiresDomTree = true) 1387 : FunctionPass(ID), RequiresDomTree(RequiresDomTree), 1388 C(0), TD(0), DT(0) { 1389 initializeSROAPass(*PassRegistry::getPassRegistry()); 1390 } 1391 bool runOnFunction(Function &F); 1392 void getAnalysisUsage(AnalysisUsage &AU) const; 1393 1394 const char *getPassName() const { return "SROA"; } 1395 static char ID; 1396 1397private: 1398 friend class PHIOrSelectSpeculator; 1399 friend class AllocaPartitionRewriter; 1400 friend class AllocaPartitionVectorRewriter; 1401 1402 bool rewriteAllocaPartition(AllocaInst &AI, 1403 AllocaPartitioning &P, 1404 AllocaPartitioning::iterator PI); 1405 bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P); 1406 bool runOnAlloca(AllocaInst &AI); 1407 void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); 1408 bool promoteAllocas(Function &F); 1409}; 1410} 1411 1412char SROA::ID = 0; 1413 1414FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { 1415 return new SROA(RequiresDomTree); 1416} 1417 1418INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", 1419 false, false) 1420INITIALIZE_PASS_DEPENDENCY(DominatorTree) 1421INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", 1422 false, false) 1423 1424namespace { 1425/// \brief Visitor to speculate PHIs and Selects where possible. 1426class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> { 1427 // Befriend the base class so it can delegate to private visit methods. 1428 friend class llvm::InstVisitor<PHIOrSelectSpeculator>; 1429 1430 const DataLayout &TD; 1431 AllocaPartitioning &P; 1432 SROA &Pass; 1433 1434public: 1435 PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass) 1436 : TD(TD), P(P), Pass(Pass) {} 1437 1438 /// \brief Visit the users of an alloca partition and rewrite them. 1439 void visitUsers(AllocaPartitioning::const_iterator PI) { 1440 // Note that we need to use an index here as the underlying vector of uses 1441 // may be grown during speculation. However, we never need to re-visit the 1442 // new uses, and so we can use the initial size bound. 1443 for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) { 1444 const AllocaPartitioning::PartitionUse &PU = P.getUse(PI, Idx); 1445 if (!PU.U) 1446 continue; // Skip dead use. 1447 1448 visit(cast<Instruction>(PU.U->getUser())); 1449 } 1450 } 1451 1452private: 1453 // By default, skip this instruction. 1454 void visitInstruction(Instruction &I) {} 1455 1456 /// PHI instructions that use an alloca and are subsequently loaded can be 1457 /// rewritten to load both input pointers in the pred blocks and then PHI the 1458 /// results, allowing the load of the alloca to be promoted. 1459 /// From this: 1460 /// %P2 = phi [i32* %Alloca, i32* %Other] 1461 /// %V = load i32* %P2 1462 /// to: 1463 /// %V1 = load i32* %Alloca -> will be mem2reg'd 1464 /// ... 1465 /// %V2 = load i32* %Other 1466 /// ... 1467 /// %V = phi [i32 %V1, i32 %V2] 1468 /// 1469 /// We can do this to a select if its only uses are loads and if the operands 1470 /// to the select can be loaded unconditionally. 1471 /// 1472 /// FIXME: This should be hoisted into a generic utility, likely in 1473 /// Transforms/Util/Local.h 1474 bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) { 1475 // For now, we can only do this promotion if the load is in the same block 1476 // as the PHI, and if there are no stores between the phi and load. 
1477 // TODO: Allow recursive phi users. 1478 // TODO: Allow stores. 1479 BasicBlock *BB = PN.getParent(); 1480 unsigned MaxAlign = 0; 1481 for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); 1482 UI != UE; ++UI) { 1483 LoadInst *LI = dyn_cast<LoadInst>(*UI); 1484 if (LI == 0 || !LI->isSimple()) return false; 1485 1486 // For now we only allow loads in the same block as the PHI. This is 1487 // a common case that happens when instcombine merges two loads through 1488 // a PHI. 1489 if (LI->getParent() != BB) return false; 1490 1491 // Ensure that there are no instructions between the PHI and the load that 1492 // could store. 1493 for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) 1494 if (BBI->mayWriteToMemory()) 1495 return false; 1496 1497 MaxAlign = std::max(MaxAlign, LI->getAlignment()); 1498 Loads.push_back(LI); 1499 } 1500 1501 // We can only transform this if it is safe to push the loads into the 1502 // predecessor blocks. The only thing to watch out for is that we can't put 1503 // a possibly trapping load in the predecessor if it is a critical edge. 1504 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; 1505 ++Idx) { 1506 TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); 1507 Value *InVal = PN.getIncomingValue(Idx); 1508 1509 // If the value is produced by the terminator of the predecessor (an 1510 // invoke) or it has side-effects, there is no valid place to put a load 1511 // in the predecessor. 1512 if (TI == InVal || TI->mayHaveSideEffects()) 1513 return false; 1514 1515 // If the predecessor has a single successor, then the edge isn't 1516 // critical. 1517 if (TI->getNumSuccessors() == 1) 1518 continue; 1519 1520 // If this pointer is always safe to load, or if we can prove that there 1521 // is already a load in the block, then we can move the load to the pred 1522 // block. 1523 if (InVal->isDereferenceablePointer() || 1524 isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD)) 1525 continue; 1526 1527 return false; 1528 } 1529 1530 return true; 1531 } 1532 1533 void visitPHINode(PHINode &PN) { 1534 DEBUG(dbgs() << " original: " << PN << "\n"); 1535 1536 SmallVector<LoadInst *, 4> Loads; 1537 if (!isSafePHIToSpeculate(PN, Loads)) 1538 return; 1539 1540 assert(!Loads.empty()); 1541 1542 Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); 1543 IRBuilder<> PHIBuilder(&PN); 1544 PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), 1545 PN.getName() + ".sroa.speculated"); 1546 1547 // Get the TBAA tag and alignment to use from one of the loads. It doesn't 1548 // matter which one we get and if any differ, it doesn't matter. 1549 LoadInst *SomeLoad = cast<LoadInst>(Loads.back()); 1550 MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); 1551 unsigned Align = SomeLoad->getAlignment(); 1552 1553 // Rewrite all loads of the PN to use the new PHI. 1554 do { 1555 LoadInst *LI = Loads.pop_back_val(); 1556 LI->replaceAllUsesWith(NewPN); 1557 Pass.DeadInsts.push_back(LI); 1558 } while (!Loads.empty()); 1559 1560 // Inject loads into all of the pred blocks. 1561 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 1562 BasicBlock *Pred = PN.getIncomingBlock(Idx); 1563 TerminatorInst *TI = Pred->getTerminator(); 1564 Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx)); 1565 Value *InVal = PN.getIncomingValue(Idx); 1566 IRBuilder<> PredBuilder(TI); 1567 1568 LoadInst *Load 1569 = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." 
+ 1570 Pred->getName())); 1571 ++NumLoadsSpeculated; 1572 Load->setAlignment(Align); 1573 if (TBAATag) 1574 Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); 1575 NewPN->addIncoming(Load, Pred); 1576 1577 Instruction *Ptr = dyn_cast<Instruction>(InVal); 1578 if (!Ptr) 1579 // No uses to rewrite. 1580 continue; 1581 1582 // Try to lookup and rewrite any partition uses corresponding to this phi 1583 // input. 1584 AllocaPartitioning::iterator PI 1585 = P.findPartitionForPHIOrSelectOperand(InUse); 1586 if (PI == P.end()) 1587 continue; 1588 1589 // Replace the Use in the PartitionUse for this operand with the Use 1590 // inside the load. 1591 AllocaPartitioning::use_iterator UI 1592 = P.findPartitionUseForPHIOrSelectOperand(InUse); 1593 assert(isa<PHINode>(*UI->U->getUser())); 1594 UI->U = &Load->getOperandUse(Load->getPointerOperandIndex()); 1595 } 1596 DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n"); 1597 } 1598 1599 /// Select instructions that use an alloca and are subsequently loaded can be 1600 /// rewritten to load both input pointers and then select between the results, 1601 /// allowing the load of the alloca to be promoted. 1602 /// From this: 1603 /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other 1604 /// %V = load i32* %P2 1605 /// to: 1606 /// %V1 = load i32* %Alloca -> will be mem2reg'd 1607 /// %V2 = load i32* %Other 1608 /// %V = select i1 %cond, i32 %V1, i32 %V2 1609 /// 1610 /// We can do this to a select if its only uses are loads and if the operands 1611 /// to the select can be loaded unconditionally. 1612 bool isSafeSelectToSpeculate(SelectInst &SI, 1613 SmallVectorImpl<LoadInst *> &Loads) { 1614 Value *TValue = SI.getTrueValue(); 1615 Value *FValue = SI.getFalseValue(); 1616 bool TDerefable = TValue->isDereferenceablePointer(); 1617 bool FDerefable = FValue->isDereferenceablePointer(); 1618 1619 for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); 1620 UI != UE; ++UI) { 1621 LoadInst *LI = dyn_cast<LoadInst>(*UI); 1622 if (LI == 0 || !LI->isSimple()) return false; 1623 1624 // Both operands to the select need to be dereferenceable, either 1625 // absolutely (e.g. allocas) or at this point because we can see other 1626 // accesses to them. 1627 if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI, 1628 LI->getAlignment(), &TD)) 1629 return false; 1630 if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI, 1631 LI->getAlignment(), &TD)) 1632 return false; 1633 Loads.push_back(LI); 1634 } 1635 1636 return true; 1637 } 1638 1639 void visitSelectInst(SelectInst &SI) { 1640 DEBUG(dbgs() << "    original: " << SI << "\n"); 1641 IRBuilder<> IRB(&SI); 1642 1643 // If the select isn't safe to speculate, just use simple logic to emit it. 1644 SmallVector<LoadInst *, 4> Loads; 1645 if (!isSafeSelectToSpeculate(SI, Loads)) 1646 return; 1647 1648 Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) }; 1649 AllocaPartitioning::iterator PIs[2]; 1650 AllocaPartitioning::PartitionUse PUs[2]; 1651 for (unsigned i = 0, e = 2; i != e; ++i) { 1652 PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]); 1653 if (PIs[i] != P.end()) { 1654 // If the pointer is within the partitioning, remove the select from 1655 // its uses. We'll add in the new loads below. 1656 AllocaPartitioning::use_iterator UI 1657 = P.findPartitionUseForPHIOrSelectOperand(Ops[i]); 1658 PUs[i] = *UI; 1659 // Clear out the use here so that the offsets into the use list remain 1660 // stable but this use is ignored when rewriting.
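      // (A null Use acts as a tombstone here: visitUsers and the partition
      //  rewriter both treat such entries as dead uses and skip them.)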
1661 UI->U = 0; 1662 } 1663 } 1664 1665 Value *TV = SI.getTrueValue(); 1666 Value *FV = SI.getFalseValue(); 1667 // Replace the loads of the select with a select of two loads. 1668 while (!Loads.empty()) { 1669 LoadInst *LI = Loads.pop_back_val(); 1670 1671 IRB.SetInsertPoint(LI); 1672 LoadInst *TL = 1673 IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); 1674 LoadInst *FL = 1675 IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); 1676 NumLoadsSpeculated += 2; 1677 1678 // Transfer alignment and TBAA info if present. 1679 TL->setAlignment(LI->getAlignment()); 1680 FL->setAlignment(LI->getAlignment()); 1681 if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { 1682 TL->setMetadata(LLVMContext::MD_tbaa, Tag); 1683 FL->setMetadata(LLVMContext::MD_tbaa, Tag); 1684 } 1685 1686 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, 1687 LI->getName() + ".sroa.speculated"); 1688 1689 LoadInst *Loads[2] = { TL, FL }; 1690 for (unsigned i = 0, e = 2; i != e; ++i) { 1691 if (PIs[i] != P.end()) { 1692 Use *LoadUse = &Loads[i]->getOperandUse(0); 1693 assert(PUs[i].U->get() == LoadUse->get()); 1694 PUs[i].U = LoadUse; 1695 P.use_push_back(PIs[i], PUs[i]); 1696 } 1697 } 1698 1699 DEBUG(dbgs() << " speculated to: " << *V << "\n"); 1700 LI->replaceAllUsesWith(V); 1701 Pass.DeadInsts.push_back(LI); 1702 } 1703 } 1704}; 1705} 1706 1707/// \brief Accumulate the constant offsets in a GEP into a single APInt offset. 1708/// 1709/// If the provided GEP is all-constant, the total byte offset formed by the 1710/// GEP is computed and Offset is set to it. If the GEP has any non-constant 1711/// operands, the function returns false and the value of Offset is unmodified. 1712static bool accumulateGEPOffsets(const DataLayout &TD, GEPOperator &GEP, 1713 APInt &Offset) { 1714 APInt GEPOffset(Offset.getBitWidth(), 0); 1715 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); 1716 GTI != GTE; ++GTI) { 1717 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); 1718 if (!OpC) 1719 return false; 1720 if (OpC->isZero()) continue; 1721 1722 // Handle a struct index, which adds its field offset to the pointer. 1723 if (StructType *STy = dyn_cast<StructType>(*GTI)) { 1724 unsigned ElementIdx = OpC->getZExtValue(); 1725 const StructLayout *SL = TD.getStructLayout(STy); 1726 GEPOffset += APInt(Offset.getBitWidth(), 1727 SL->getElementOffset(ElementIdx)); 1728 continue; 1729 } 1730 1731 APInt TypeSize(Offset.getBitWidth(), 1732 TD.getTypeAllocSize(GTI.getIndexedType())); 1733 if (VectorType *VTy = dyn_cast<VectorType>(*GTI)) { 1734 assert((VTy->getScalarSizeInBits() % 8) == 0 && 1735 "vector element size is not a multiple of 8, cannot GEP over it"); 1736 TypeSize = VTy->getScalarSizeInBits() / 8; 1737 } 1738 1739 GEPOffset += OpC->getValue().sextOrTrunc(Offset.getBitWidth()) * TypeSize; 1740 } 1741 Offset = GEPOffset; 1742 return true; 1743} 1744 1745/// \brief Build a GEP out of a base pointer and indices. 1746/// 1747/// This will return the BasePtr if that is valid, or build a new GEP 1748/// instruction using the IRBuilder if GEP-ing is needed. 1749static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr, 1750 SmallVectorImpl<Value *> &Indices, 1751 const Twine &Prefix) { 1752 if (Indices.empty()) 1753 return BasePtr; 1754 1755 // A single zero index is a no-op, so check for this and avoid building a GEP 1756 // in that case. 
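  // (For example, 'getelementptr inbounds i32* %p, i64 0' is just %p, so
  //  emitting it would only add noise to the IR.)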
1757 if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) 1758 return BasePtr; 1759 1760 return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx"); 1761} 1762 1763/// \brief Get a natural GEP off of the BasePtr walking through Ty toward 1764/// TargetTy without changing the offset of the pointer. 1765/// 1766/// This routine assumes we've already established a properly offset GEP with 1767/// Indices, and arrived at the Ty type. The goal is to continue to GEP with 1768/// zero-indices down through type layers until we find one the same as 1769/// TargetTy. If we can't find one with the same type, we at least try to use 1770/// one with the same size. If none of that works, we just produce the GEP as 1771/// indicated by Indices to have the correct offset. 1772static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD, 1773 Value *BasePtr, Type *Ty, Type *TargetTy, 1774 SmallVectorImpl<Value *> &Indices, 1775 const Twine &Prefix) { 1776 if (Ty == TargetTy) 1777 return buildGEP(IRB, BasePtr, Indices, Prefix); 1778 1779 // See if we can descend into a struct and locate a field with the correct 1780 // type. 1781 unsigned NumLayers = 0; 1782 Type *ElementTy = Ty; 1783 do { 1784 if (ElementTy->isPointerTy()) 1785 break; 1786 if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) { 1787 ElementTy = SeqTy->getElementType(); 1788 Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits( 1789 ElementTy->isPointerTy() ? 1790 cast<PointerType>(ElementTy)->getAddressSpace(): 0), 0))); 1791 } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { 1792 if (STy->element_begin() == STy->element_end()) 1793 break; // Nothing left to descend into. 1794 ElementTy = *STy->element_begin(); 1795 Indices.push_back(IRB.getInt32(0)); 1796 } else { 1797 break; 1798 } 1799 ++NumLayers; 1800 } while (ElementTy != TargetTy); 1801 if (ElementTy != TargetTy) 1802 Indices.erase(Indices.end() - NumLayers, Indices.end()); 1803 1804 return buildGEP(IRB, BasePtr, Indices, Prefix); 1805} 1806 1807/// \brief Recursively compute indices for a natural GEP. 1808/// 1809/// This is the recursive step for getNaturalGEPWithOffset that walks down the 1810/// element types adding appropriate indices for the GEP. 1811static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, 1812 Value *Ptr, Type *Ty, APInt &Offset, 1813 Type *TargetTy, 1814 SmallVectorImpl<Value *> &Indices, 1815 const Twine &Prefix) { 1816 if (Offset == 0) 1817 return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix); 1818 1819 // We can't recurse through pointer types. 1820 if (Ty->isPointerTy()) 1821 return 0; 1822 1823 // We try to analyze GEPs over vectors here, but note that these GEPs are 1824 // extremely poorly defined currently. The long-term goal is to remove GEPing 1825 // over a vector from the IR completely. 1826 if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { 1827 unsigned ElementSizeInBits = VecTy->getScalarSizeInBits(); 1828 if (ElementSizeInBits % 8) 1829 return 0; // GEPs over non-multiple of 8 size vector elements are invalid. 
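    // The remaining offset is resolved one whole vector element at a time:
    // dividing by the element size gives the index to GEP to, and we then
    // recurse into the element type with whatever offset is left over.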
1830 APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); 1831 APInt NumSkippedElements = Offset.udiv(ElementSize); 1832 if (NumSkippedElements.ugt(VecTy->getNumElements())) 1833 return 0; 1834 Offset -= NumSkippedElements * ElementSize; 1835 Indices.push_back(IRB.getInt(NumSkippedElements)); 1836 return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), 1837 Offset, TargetTy, Indices, Prefix); 1838 } 1839 1840 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 1841 Type *ElementTy = ArrTy->getElementType(); 1842 APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); 1843 APInt NumSkippedElements = Offset.udiv(ElementSize); 1844 if (NumSkippedElements.ugt(ArrTy->getNumElements())) 1845 return 0; 1846 1847 Offset -= NumSkippedElements * ElementSize; 1848 Indices.push_back(IRB.getInt(NumSkippedElements)); 1849 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1850 Indices, Prefix); 1851 } 1852 1853 StructType *STy = dyn_cast<StructType>(Ty); 1854 if (!STy) 1855 return 0; 1856 1857 const StructLayout *SL = TD.getStructLayout(STy); 1858 uint64_t StructOffset = Offset.getZExtValue(); 1859 if (StructOffset >= SL->getSizeInBytes()) 1860 return 0; 1861 unsigned Index = SL->getElementContainingOffset(StructOffset); 1862 Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); 1863 Type *ElementTy = STy->getElementType(Index); 1864 if (Offset.uge(TD.getTypeAllocSize(ElementTy))) 1865 return 0; // The offset points into alignment padding. 1866 1867 Indices.push_back(IRB.getInt32(Index)); 1868 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1869 Indices, Prefix); 1870} 1871 1872/// \brief Get a natural GEP from a base pointer to a particular offset and 1873/// resulting in a particular type. 1874/// 1875/// The goal is to produce a "natural" looking GEP that works with the existing 1876/// composite types to arrive at the appropriate offset and element type for 1877/// a pointer. TargetTy is the element type the returned GEP should point-to if 1878/// possible. We recurse by decreasing Offset, adding the appropriate index to 1879/// Indices, and setting Ty to the result subtype. 1880/// 1881/// If no natural GEP can be constructed, this function returns null. 1882static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, 1883 Value *Ptr, APInt Offset, Type *TargetTy, 1884 SmallVectorImpl<Value *> &Indices, 1885 const Twine &Prefix) { 1886 PointerType *Ty = cast<PointerType>(Ptr->getType()); 1887 1888 // Don't consider any GEPs through an i8* as natural unless the TargetTy is 1889 // an i8. 1890 if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8)) 1891 return 0; 1892 1893 Type *ElementTy = Ty->getElementType(); 1894 if (!ElementTy->isSized()) 1895 return 0; // We can't GEP through an unsized element. 1896 APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); 1897 if (ElementSize == 0) 1898 return 0; // Zero-length arrays can't help us build a natural GEP. 1899 APInt NumSkippedElements = Offset.udiv(ElementSize); 1900 1901 Offset -= NumSkippedElements * ElementSize; 1902 Indices.push_back(IRB.getInt(NumSkippedElements)); 1903 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1904 Indices, Prefix); 1905} 1906 1907/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the 1908/// resulting pointer has PointerTy. 
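/// For example, an offset of 4 bytes into a [4 x i32] alloca would ideally be
/// realized as 'getelementptr inbounds [4 x i32]* %a, i64 0, i64 1' rather than
/// as a raw i8 GEP followed by a bitcast.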
1909/// 1910/// This tries very hard to compute a "natural" GEP which arrives at the offset 1911/// and produces the pointer type desired. Where it cannot, it will try to use 1912/// the natural GEP to arrive at the offset and bitcast to the type. Where that 1913/// fails, it will try to use an existing i8* and GEP to the byte offset and 1914/// bitcast to the type. 1915/// 1916/// The strategy for finding the more natural GEPs is to peel off layers of the 1917/// pointer, walking back through bit casts and GEPs, searching for a base 1918/// pointer from which we can compute a natural GEP with the desired 1919/// properties. The algorithm tries to fold as many constant indices into 1920/// a single GEP as possible, thus making each GEP more independent of the 1921/// surrounding code. 1922static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD, 1923 Value *Ptr, APInt Offset, Type *PointerTy, 1924 const Twine &Prefix) { 1925 // Even though we don't look through PHI nodes, we could be called on an 1926 // instruction in an unreachable block, which may be on a cycle. 1927 SmallPtrSet<Value *, 4> Visited; 1928 Visited.insert(Ptr); 1929 SmallVector<Value *, 4> Indices; 1930 1931 // We may end up computing an offset pointer that has the wrong type. If we 1932 // never are able to compute one directly that has the correct type, we'll 1933 // fall back to it, so keep it around here. 1934 Value *OffsetPtr = 0; 1935 1936 // Remember any i8 pointer we come across to re-use if we need to do a raw 1937 // byte offset. 1938 Value *Int8Ptr = 0; 1939 APInt Int8PtrOffset(Offset.getBitWidth(), 0); 1940 1941 Type *TargetTy = PointerTy->getPointerElementType(); 1942 1943 do { 1944 // First fold any existing GEPs into the offset. 1945 while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { 1946 APInt GEPOffset(Offset.getBitWidth(), 0); 1947 if (!accumulateGEPOffsets(TD, *GEP, GEPOffset)) 1948 break; 1949 Offset += GEPOffset; 1950 Ptr = GEP->getPointerOperand(); 1951 if (!Visited.insert(Ptr)) 1952 break; 1953 } 1954 1955 // See if we can perform a natural GEP here. 1956 Indices.clear(); 1957 if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy, 1958 Indices, Prefix)) { 1959 if (P->getType() == PointerTy) { 1960 // Zap any offset pointer that we ended up computing in previous rounds. 1961 if (OffsetPtr && OffsetPtr->use_empty()) 1962 if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) 1963 I->eraseFromParent(); 1964 return P; 1965 } 1966 if (!OffsetPtr) { 1967 OffsetPtr = P; 1968 } 1969 } 1970 1971 // Stash this pointer if we've found an i8*. 1972 if (Ptr->getType()->isIntegerTy(8)) { 1973 Int8Ptr = Ptr; 1974 Int8PtrOffset = Offset; 1975 } 1976 1977 // Peel off a layer of the pointer and update the offset appropriately. 1978 if (Operator::getOpcode(Ptr) == Instruction::BitCast) { 1979 Ptr = cast<Operator>(Ptr)->getOperand(0); 1980 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) { 1981 if (GA->mayBeOverridden()) 1982 break; 1983 Ptr = GA->getAliasee(); 1984 } else { 1985 break; 1986 } 1987 assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!"); 1988 } while (Visited.insert(Ptr)); 1989 1990 if (!OffsetPtr) { 1991 if (!Int8Ptr) { 1992 Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(), 1993 Prefix + ".raw_cast"); 1994 Int8PtrOffset = Offset; 1995 } 1996 1997 OffsetPtr = Int8PtrOffset == 0 ?
Int8Ptr : 1998 IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), 1999 Prefix + ".raw_idx"); 2000 } 2001 Ptr = OffsetPtr; 2002 2003 // On the off chance we were targeting i8*, guard the bitcast here. 2004 if (Ptr->getType() != PointerTy) 2005 Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast"); 2006 2007 return Ptr; 2008} 2009 2010/// \brief Test whether we can convert a value from the old to the new type. 2011/// 2012/// This predicate should be used to guard calls to convertValue in order to 2013/// ensure that we only try to convert viable values. The strategy is that we 2014/// will peel off single element struct and array wrappings to get to an 2015/// underlying value, and convert that value. 2016static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { 2017 if (OldTy == NewTy) 2018 return true; 2019 if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) 2020 return false; 2021 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) 2022 return false; 2023 2024 if (NewTy->isPointerTy() || OldTy->isPointerTy()) { 2025 if (NewTy->isPointerTy() && OldTy->isPointerTy()) 2026 return true; 2027 if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) 2028 return true; 2029 return false; 2030 } 2031 2032 return true; 2033} 2034 2035/// \brief Generic routine to convert an SSA value to a value of a different 2036/// type. 2037/// 2038/// This will try various different casting techniques, such as bitcasts, 2039/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test 2040/// two types for viability with this routine. 2041static Value *convertValue(const DataLayout &DL, IRBuilder<> &IRB, Value *V, 2042 Type *Ty) { 2043 assert(canConvertValue(DL, V->getType(), Ty) && 2044 "Value not convertable to type"); 2045 if (V->getType() == Ty) 2046 return V; 2047 if (V->getType()->isIntegerTy() && Ty->isPointerTy()) 2048 return IRB.CreateIntToPtr(V, Ty); 2049 if (V->getType()->isPointerTy() && Ty->isIntegerTy()) 2050 return IRB.CreatePtrToInt(V, Ty); 2051 2052 return IRB.CreateBitCast(V, Ty); 2053} 2054 2055/// \brief Test whether the given alloca partition can be promoted to a vector. 2056/// 2057/// This is a quick test to check whether we can rewrite a particular alloca 2058/// partition (and its newly formed alloca) into a vector alloca with only 2059/// whole-vector loads and stores such that it could be promoted to a vector 2060/// SSA value. We only can ensure this for a limited set of operations, and we 2061/// don't want to do the rewrites unless we are confident that the result will 2062/// be promotable, so we have an early test here. 2063static bool isVectorPromotionViable(const DataLayout &TD, 2064 Type *AllocaTy, 2065 AllocaPartitioning &P, 2066 uint64_t PartitionBeginOffset, 2067 uint64_t PartitionEndOffset, 2068 AllocaPartitioning::const_use_iterator I, 2069 AllocaPartitioning::const_use_iterator E) { 2070 VectorType *Ty = dyn_cast<VectorType>(AllocaTy); 2071 if (!Ty) 2072 return false; 2073 2074 uint64_t VecSize = TD.getTypeSizeInBits(Ty); 2075 uint64_t ElementSize = Ty->getScalarSizeInBits(); 2076 2077 // While the definition of LLVM vectors is bitpacked, we don't support sizes 2078 // that aren't byte sized. 2079 if (ElementSize % 8) 2080 return false; 2081 assert((VecSize % 8) == 0 && "vector size not a multiple of element size?"); 2082 VecSize /= 8; 2083 ElementSize /= 8; 2084 2085 for (; I != E; ++I) { 2086 if (!I->U) 2087 continue; // Skip dead use. 
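    // Each live use must line up exactly with either a single vector element or
    // the whole vector; see the FIXME below about handling other widths via
    // shufflevector.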
2088 2089 uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset; 2090 uint64_t BeginIndex = BeginOffset / ElementSize; 2091 if (BeginIndex * ElementSize != BeginOffset || 2092 BeginIndex >= Ty->getNumElements()) 2093 return false; 2094 uint64_t EndOffset = I->EndOffset - PartitionBeginOffset; 2095 uint64_t EndIndex = EndOffset / ElementSize; 2096 if (EndIndex * ElementSize != EndOffset || 2097 EndIndex > Ty->getNumElements()) 2098 return false; 2099 2100 // FIXME: We should build shuffle vector instructions to handle 2101 // non-element-sized accesses. 2102 if ((EndOffset - BeginOffset) != ElementSize && 2103 (EndOffset - BeginOffset) != VecSize) 2104 return false; 2105 2106 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) { 2107 if (MI->isVolatile()) 2108 return false; 2109 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) { 2110 const AllocaPartitioning::MemTransferOffsets &MTO 2111 = P.getMemTransferOffsets(*MTI); 2112 if (!MTO.IsSplittable) 2113 return false; 2114 } 2115 } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) { 2116 // Disable vector promotion when there are loads or stores of an FCA. 2117 return false; 2118 } else if (!isa<LoadInst>(I->U->getUser()) && 2119 !isa<StoreInst>(I->U->getUser())) { 2120 return false; 2121 } 2122 } 2123 return true; 2124} 2125 2126/// \brief Test whether the given alloca partition's integer operations can be 2127/// widened to promotable ones. 2128/// 2129/// This is a quick test to check whether we can rewrite the integer loads and 2130/// stores to a particular alloca into wider loads and stores and be able to 2131/// promote the resulting alloca. 2132static bool isIntegerWideningViable(const DataLayout &TD, 2133 Type *AllocaTy, 2134 uint64_t AllocBeginOffset, 2135 AllocaPartitioning &P, 2136 AllocaPartitioning::const_use_iterator I, 2137 AllocaPartitioning::const_use_iterator E) { 2138 uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy); 2139 2140 // Don't try to handle allocas with bit-padding. 2141 if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy)) 2142 return false; 2143 2144 uint64_t Size = TD.getTypeStoreSize(AllocaTy); 2145 2146 // Check the uses to ensure the uses are (likely) promotable integer uses. 2147 // Also ensure that the alloca has a covering load or store. We don't want 2148 // to widen the integer operations only to fail to promote due to some other 2149 // unsplittable entry (which we may make splittable later). 2150 bool WholeAllocaOp = false; 2151 for (; I != E; ++I) { 2152 if (!I->U) 2153 continue; // Skip dead use. 2154 2155 uint64_t RelBegin = I->BeginOffset - AllocBeginOffset; 2156 uint64_t RelEnd = I->EndOffset - AllocBeginOffset; 2157 2158 // We can't reasonably handle cases where the load or store extends past 2159 // the end of the alloca's type and into its padding. 2160 if (RelEnd > Size) 2161 return false; 2162 2163 if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) { 2164 if (LI->isVolatile()) 2165 return false; 2166 if (RelBegin == 0 && RelEnd == Size) 2167 WholeAllocaOp = true; 2168 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { 2169 if (ITy->getBitWidth() < TD.getTypeStoreSize(ITy)) 2170 return false; 2171 continue; 2172 } 2173 // Non-integer loads need to be convertible from the alloca type so that 2174 // they are promotable.
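      // (For example, a whole-alloca 'load float* %a' of an i32-typed alloca can
      //  later be rewritten as an i32 load followed by a bitcast; see
      //  canConvertValue and convertValue.)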
2175 if (RelBegin != 0 || RelEnd != Size || 2176 !canConvertValue(TD, AllocaTy, LI->getType())) 2177 return false; 2178 } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) { 2179 Type *ValueTy = SI->getValueOperand()->getType(); 2180 if (SI->isVolatile()) 2181 return false; 2182 if (RelBegin == 0 && RelEnd == Size) 2183 WholeAllocaOp = true; 2184 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { 2185 if (ITy->getBitWidth() < TD.getTypeStoreSize(ITy)) 2186 return false; 2187 continue; 2188 } 2189 // Non-integer stores need to be convertible to the alloca type so that 2190 // they are promotable. 2191 if (RelBegin != 0 || RelEnd != Size || 2192 !canConvertValue(TD, ValueTy, AllocaTy)) 2193 return false; 2194 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) { 2195 if (MI->isVolatile()) 2196 return false; 2197 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) { 2198 const AllocaPartitioning::MemTransferOffsets &MTO 2199 = P.getMemTransferOffsets(*MTI); 2200 if (!MTO.IsSplittable) 2201 return false; 2202 } 2203 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->U->getUser())) { 2204 if (II->getIntrinsicID() != Intrinsic::lifetime_start && 2205 II->getIntrinsicID() != Intrinsic::lifetime_end) 2206 return false; 2207 } else { 2208 return false; 2209 } 2210 } 2211 return WholeAllocaOp; 2212} 2213 2214namespace { 2215/// \brief Visitor to rewrite instructions using a partition of an alloca to 2216/// use a new alloca. 2217/// 2218/// Also implements the rewriting to vector-based accesses when the partition 2219/// passes the isVectorPromotionViable predicate. Most of the rewriting logic 2220/// lives here. 2221class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, 2222 bool> { 2223 // Befriend the base class so it can delegate to private visit methods. 2224 friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>; 2225 2226 const DataLayout &TD; 2227 AllocaPartitioning &P; 2228 SROA &Pass; 2229 AllocaInst &OldAI, &NewAI; 2230 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; 2231 Type *NewAllocaTy; 2232 2233 // If we are rewriting an alloca partition which can be written as pure 2234 // vector operations, we stash extra information here. When VecTy is 2235 // non-null, we have some strict guarantees about the rewritten alloca: 2236 // - The new alloca is exactly the size of the vector type here. 2237 // - The accesses all either map to the entire vector or to a single 2238 // element. 2239 // - The set of accessing instructions is limited to those handled above 2240 // in isVectorPromotionViable. Generally these are the same access kinds 2241 // which are promotable via mem2reg. 2242 VectorType *VecTy; 2243 Type *ElementTy; 2244 uint64_t ElementSize; 2245 2246 // This is a convenience and flag variable that will be null unless the new 2247 // alloca's integer operations should be widened to this integer type due to 2248 // passing isIntegerWideningViable above. If it is non-null, the desired 2249 // integer type will be stored here for easy access during rewriting. 2250 IntegerType *IntTy; 2251 2252 // The offset of the partition user currently being rewritten. 2253 uint64_t BeginOffset, EndOffset; 2254 Use *OldUse; 2255 Instruction *OldPtr; 2256 2257 // The name prefix to use when rewriting instructions for this alloca.
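  // (Built in visitUsers from the new alloca's name plus the begin offset of
  //  the use currently being rewritten.)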
2258 std::string NamePrefix; 2259 2260public: 2261 AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P, 2262 AllocaPartitioning::iterator PI, 2263 SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, 2264 uint64_t NewBeginOffset, uint64_t NewEndOffset) 2265 : TD(TD), P(P), Pass(Pass), 2266 OldAI(OldAI), NewAI(NewAI), 2267 NewAllocaBeginOffset(NewBeginOffset), 2268 NewAllocaEndOffset(NewEndOffset), 2269 NewAllocaTy(NewAI.getAllocatedType()), 2270 VecTy(), ElementTy(), ElementSize(), IntTy(), 2271 BeginOffset(), EndOffset() { 2272 } 2273 2274 /// \brief Visit the users of the alloca partition and rewrite them. 2275 bool visitUsers(AllocaPartitioning::const_use_iterator I, 2276 AllocaPartitioning::const_use_iterator E) { 2277 if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P, 2278 NewAllocaBeginOffset, NewAllocaEndOffset, 2279 I, E)) { 2280 ++NumVectorized; 2281 VecTy = cast<VectorType>(NewAI.getAllocatedType()); 2282 ElementTy = VecTy->getElementType(); 2283 assert((VecTy->getScalarSizeInBits() % 8) == 0 && 2284 "Only multiple-of-8 sized vector elements are viable"); 2285 ElementSize = VecTy->getScalarSizeInBits() / 8; 2286 } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(), 2287 NewAllocaBeginOffset, P, I, E)) { 2288 IntTy = Type::getIntNTy(NewAI.getContext(), 2289 TD.getTypeSizeInBits(NewAI.getAllocatedType())); 2290 } 2291 bool CanSROA = true; 2292 for (; I != E; ++I) { 2293 if (!I->U) 2294 continue; // Skip dead uses. 2295 BeginOffset = I->BeginOffset; 2296 EndOffset = I->EndOffset; 2297 OldUse = I->U; 2298 OldPtr = cast<Instruction>(I->U->get()); 2299 NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str(); 2300 CanSROA &= visit(cast<Instruction>(I->U->getUser())); 2301 } 2302 if (VecTy) { 2303 assert(CanSROA); 2304 VecTy = 0; 2305 ElementTy = 0; 2306 ElementSize = 0; 2307 } 2308 if (IntTy) { 2309 assert(CanSROA); 2310 IntTy = 0; 2311 } 2312 return CanSROA; 2313 } 2314 2315private: 2316 // Every instruction which can end up as a user must have a rewrite rule. 2317 bool visitInstruction(Instruction &I) { 2318 DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); 2319 llvm_unreachable("No rewrite rule for this instruction!"); 2320 } 2321 2322 Twine getName(const Twine &Suffix) { 2323 return NamePrefix + Suffix; 2324 } 2325 2326 Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) { 2327 assert(BeginOffset >= NewAllocaBeginOffset); 2328 unsigned AS = cast<PointerType>(PointerTy)->getAddressSpace(); 2329 APInt Offset(TD.getPointerSizeInBits(AS), BeginOffset - NewAllocaBeginOffset); 2330 return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName("")); 2331 } 2332 2333 /// \brief Compute suitable alignment to access an offset into the new alloca. 2334 unsigned getOffsetAlign(uint64_t Offset) { 2335 unsigned NewAIAlign = NewAI.getAlignment(); 2336 if (!NewAIAlign) 2337 NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType()); 2338 return MinAlign(NewAIAlign, Offset); 2339 } 2340 2341 /// \brief Compute suitable alignment to access this partition of the new 2342 /// alloca. 2343 unsigned getPartitionAlign() { 2344 return getOffsetAlign(BeginOffset - NewAllocaBeginOffset); 2345 } 2346 2347 /// \brief Compute suitable alignment to access a type at an offset of the 2348 /// new alloca. 2349 /// 2350 /// \returns zero if the type's ABI alignment is a suitable alignment, 2351 /// otherwise returns the maximal suitable alignment. 
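  /// (Returning zero lets the rewritten access carry no explicit alignment,
  ///  which LLVM treats as the default ABI alignment for the accessed type.)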
2352 unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { 2353 unsigned Align = getOffsetAlign(Offset); 2354 return Align == TD.getABITypeAlignment(Ty) ? 0 : Align; 2355 } 2356 2357 /// \brief Compute suitable alignment to access a type at the beginning of 2358 /// this partition of the new alloca. 2359 /// 2360 /// See \c getOffsetTypeAlign for details; this routine delegates to it. 2361 unsigned getPartitionTypeAlign(Type *Ty) { 2362 return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset); 2363 } 2364 2365 ConstantInt *getIndex(IRBuilder<> &IRB, uint64_t Offset) { 2366 assert(VecTy && "Can only call getIndex when rewriting a vector"); 2367 uint64_t RelOffset = Offset - NewAllocaBeginOffset; 2368 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); 2369 uint32_t Index = RelOffset / ElementSize; 2370 assert(Index * ElementSize == RelOffset); 2371 return IRB.getInt32(Index); 2372 } 2373 2374 Value *extractInteger(IRBuilder<> &IRB, IntegerType *TargetTy, 2375 uint64_t Offset) { 2376 assert(IntTy && "We cannot extract an integer from the alloca"); 2377 Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2378 getName(".load")); 2379 V = convertValue(TD, IRB, V, IntTy); 2380 assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset"); 2381 uint64_t RelOffset = Offset - NewAllocaBeginOffset; 2382 assert(TD.getTypeStoreSize(TargetTy) + RelOffset <= 2383 TD.getTypeStoreSize(IntTy) && 2384 "Element load outside of alloca store"); 2385 uint64_t ShAmt = 8*RelOffset; 2386 if (TD.isBigEndian()) 2387 ShAmt = 8*(TD.getTypeStoreSize(IntTy) - 2388 TD.getTypeStoreSize(TargetTy) - RelOffset); 2389 if (ShAmt) 2390 V = IRB.CreateLShr(V, ShAmt, getName(".shift")); 2391 assert(TargetTy->getBitWidth() <= IntTy->getBitWidth() && 2392 "Cannot extract to a larger integer!"); 2393 if (TargetTy != IntTy) 2394 V = IRB.CreateTrunc(V, TargetTy, getName(".trunc")); 2395 return V; 2396 } 2397 2398 StoreInst *insertInteger(IRBuilder<> &IRB, Value *V, uint64_t Offset) { 2399 IntegerType *Ty = cast<IntegerType>(V->getType()); 2400 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 2401 "Cannot insert a larger integer!"); 2402 if (Ty != IntTy) 2403 V = IRB.CreateZExt(V, IntTy, getName(".ext")); 2404 assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset"); 2405 uint64_t RelOffset = Offset - NewAllocaBeginOffset; 2406 assert(TD.getTypeStoreSize(Ty) + RelOffset <= 2407 TD.getTypeStoreSize(IntTy) && 2408 "Element store outside of alloca store"); 2409 uint64_t ShAmt = 8*RelOffset; 2410 if (TD.isBigEndian()) 2411 ShAmt = 8*(TD.getTypeStoreSize(IntTy) - TD.getTypeStoreSize(Ty) 2412 - RelOffset); 2413 if (ShAmt) 2414 V = IRB.CreateShl(V, ShAmt, getName(".shift")); 2415 2416 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { 2417 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); 2418 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2419 getName(".oldload")); 2420 Old = convertValue(TD, IRB, Old, IntTy); 2421 Old = IRB.CreateAnd(Old, Mask, getName(".mask")); 2422 V = IRB.CreateOr(Old, V, getName(".insert")); 2423 } 2424 V = convertValue(TD, IRB, V, NewAllocaTy); 2425 return IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 2426 } 2427 2428 void deleteIfTriviallyDead(Value *V) { 2429 Instruction *I = cast<Instruction>(V); 2430 if (isInstructionTriviallyDead(I)) 2431 Pass.DeadInsts.push_back(I); 2432 } 2433 2434 bool rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) { 2435 Value *Result; 2436 if (LI.getType() == 
VecTy->getElementType() || 2437 BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) { 2438 Result = IRB.CreateExtractElement( 2439 IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")), 2440 getIndex(IRB, BeginOffset), getName(".extract")); 2441 } else { 2442 Result = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2443 getName(".load")); 2444 } 2445 if (Result->getType() != LI.getType()) 2446 Result = convertValue(TD, IRB, Result, LI.getType()); 2447 LI.replaceAllUsesWith(Result); 2448 Pass.DeadInsts.push_back(&LI); 2449 2450 DEBUG(dbgs() << " to: " << *Result << "\n"); 2451 return true; 2452 } 2453 2454 bool rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) { 2455 assert(!LI.isVolatile()); 2456 Value *Result = extractInteger(IRB, cast<IntegerType>(LI.getType()), 2457 BeginOffset); 2458 LI.replaceAllUsesWith(Result); 2459 Pass.DeadInsts.push_back(&LI); 2460 DEBUG(dbgs() << " to: " << *Result << "\n"); 2461 return true; 2462 } 2463 2464 bool visitLoadInst(LoadInst &LI) { 2465 DEBUG(dbgs() << " original: " << LI << "\n"); 2466 Value *OldOp = LI.getOperand(0); 2467 assert(OldOp == OldPtr); 2468 IRBuilder<> IRB(&LI); 2469 2470 if (VecTy) 2471 return rewriteVectorizedLoadInst(IRB, LI, OldOp); 2472 if (IntTy && LI.getType()->isIntegerTy()) 2473 return rewriteIntegerLoad(IRB, LI); 2474 2475 if (BeginOffset == NewAllocaBeginOffset && 2476 canConvertValue(TD, NewAllocaTy, LI.getType())) { 2477 Value *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2478 LI.isVolatile(), getName(".load")); 2479 Value *NewV = convertValue(TD, IRB, NewLI, LI.getType()); 2480 LI.replaceAllUsesWith(NewV); 2481 Pass.DeadInsts.push_back(&LI); 2482 2483 DEBUG(dbgs() << " to: " << *NewLI << "\n"); 2484 return !LI.isVolatile(); 2485 } 2486 2487 assert(!IntTy && "Invalid load found with int-op widening enabled"); 2488 2489 Value *NewPtr = getAdjustedAllocaPtr(IRB, 2490 LI.getPointerOperand()->getType()); 2491 LI.setOperand(0, NewPtr); 2492 LI.setAlignment(getPartitionTypeAlign(LI.getType())); 2493 DEBUG(dbgs() << " to: " << LI << "\n"); 2494 2495 deleteIfTriviallyDead(OldOp); 2496 return NewPtr == &NewAI && !LI.isVolatile(); 2497 } 2498 2499 bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, StoreInst &SI, 2500 Value *OldOp) { 2501 Value *V = SI.getValueOperand(); 2502 if (V->getType() == ElementTy || 2503 BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) { 2504 if (V->getType() != ElementTy) 2505 V = convertValue(TD, IRB, V, ElementTy); 2506 LoadInst *LI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2507 getName(".load")); 2508 V = IRB.CreateInsertElement(LI, V, getIndex(IRB, BeginOffset), 2509 getName(".insert")); 2510 } else if (V->getType() != VecTy) { 2511 V = convertValue(TD, IRB, V, VecTy); 2512 } 2513 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 2514 Pass.DeadInsts.push_back(&SI); 2515 2516 (void)Store; 2517 DEBUG(dbgs() << " to: " << *Store << "\n"); 2518 return true; 2519 } 2520 2521 bool rewriteIntegerStore(IRBuilder<> &IRB, StoreInst &SI) { 2522 assert(!SI.isVolatile()); 2523 StoreInst *Store = insertInteger(IRB, SI.getValueOperand(), BeginOffset); 2524 Pass.DeadInsts.push_back(&SI); 2525 (void)Store; 2526 DEBUG(dbgs() << " to: " << *Store << "\n"); 2527 return true; 2528 } 2529 2530 bool visitStoreInst(StoreInst &SI) { 2531 DEBUG(dbgs() << " original: " << SI << "\n"); 2532 Value *OldOp = SI.getOperand(1); 2533 assert(OldOp == OldPtr); 2534 IRBuilder<> IRB(&SI); 2535 2536 if (VecTy) 2537 return 
rewriteVectorizedStoreInst(IRB, SI, OldOp); 2538 Type *ValueTy = SI.getValueOperand()->getType(); 2539 if (IntTy && ValueTy->isIntegerTy()) 2540 return rewriteIntegerStore(IRB, SI); 2541 2542 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2543 // alloca that should be re-examined after promoting this alloca. 2544 if (ValueTy->isPointerTy()) 2545 if (AllocaInst *AI = dyn_cast<AllocaInst>(SI.getValueOperand() 2546 ->stripInBoundsOffsets())) 2547 Pass.PostPromotionWorklist.insert(AI); 2548 2549 if (BeginOffset == NewAllocaBeginOffset && 2550 canConvertValue(TD, ValueTy, NewAllocaTy)) { 2551 Value *NewV = convertValue(TD, IRB, SI.getValueOperand(), NewAllocaTy); 2552 StoreInst *NewSI = IRB.CreateAlignedStore(NewV, &NewAI, NewAI.getAlignment(), 2553 SI.isVolatile()); 2554 (void)NewSI; 2555 Pass.DeadInsts.push_back(&SI); 2556 2557 DEBUG(dbgs() << " to: " << *NewSI << "\n"); 2558 return !SI.isVolatile(); 2559 } 2560 2561 assert(!IntTy && "Invalid store found with int-op widening enabled"); 2562 2563 Value *NewPtr = getAdjustedAllocaPtr(IRB, 2564 SI.getPointerOperand()->getType()); 2565 SI.setOperand(1, NewPtr); 2566 SI.setAlignment(getPartitionTypeAlign(SI.getValueOperand()->getType())); 2567 DEBUG(dbgs() << " to: " << SI << "\n"); 2568 2569 deleteIfTriviallyDead(OldOp); 2570 return NewPtr == &NewAI && !SI.isVolatile(); 2571 } 2572 2573 bool visitMemSetInst(MemSetInst &II) { 2574 DEBUG(dbgs() << " original: " << II << "\n"); 2575 IRBuilder<> IRB(&II); 2576 assert(II.getRawDest() == OldPtr); 2577 2578 // If the memset has a variable size, it cannot be split, just adjust the 2579 // pointer to the new alloca. 2580 if (!isa<Constant>(II.getLength())) { 2581 II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); 2582 Type *CstTy = II.getAlignmentCst()->getType(); 2583 II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign())); 2584 2585 deleteIfTriviallyDead(OldPtr); 2586 return false; 2587 } 2588 2589 // Record this instruction for deletion. 2590 if (Pass.DeadSplitInsts.insert(&II)) 2591 Pass.DeadInsts.push_back(&II); 2592 2593 Type *AllocaTy = NewAI.getAllocatedType(); 2594 Type *ScalarTy = AllocaTy->getScalarType(); 2595 2596 // If this doesn't map cleanly onto the alloca type, and that type isn't 2597 // a single value type, just emit a memset. 2598 if (!VecTy && !IntTy && 2599 (BeginOffset != NewAllocaBeginOffset || 2600 EndOffset != NewAllocaEndOffset || 2601 !AllocaTy->isSingleValueType() || 2602 !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) { 2603 Type *SizeTy = II.getLength()->getType(); 2604 Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); 2605 CallInst *New 2606 = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB, 2607 II.getRawDest()->getType()), 2608 II.getValue(), Size, getPartitionAlign(), 2609 II.isVolatile()); 2610 (void)New; 2611 DEBUG(dbgs() << " to: " << *New << "\n"); 2612 return false; 2613 } 2614 2615 // If we can represent this as a simple value, we have to build the actual 2616 // value to store, which requires expanding the byte present in memset to 2617 // a sensible representation for the alloca type. This is essentially 2618 // splatting the byte to a sufficiently wide integer, bitcasting to the 2619 // desired scalar type, and splatting it across any desired vector type. 
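    // (For instance, a 'memset(%a, 0xAB, 4)' covering an i32 alloca roughly
    //  becomes a store of the constant 0xABABABAB, which mem2reg can then
    //  promote like any other scalar store.)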
2620 uint64_t Size = EndOffset - BeginOffset; 2621 Value *V = II.getValue(); 2622 IntegerType *VTy = cast<IntegerType>(V->getType()); 2623 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); 2624 if (Size*8 > VTy->getBitWidth()) 2625 V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")), 2626 ConstantExpr::getUDiv( 2627 Constant::getAllOnesValue(SplatIntTy), 2628 ConstantExpr::getZExt( 2629 Constant::getAllOnesValue(V->getType()), 2630 SplatIntTy)), 2631 getName(".isplat")); 2632 2633 // If this is an element-wide memset of a vectorizable alloca, insert it. 2634 if (VecTy && (BeginOffset > NewAllocaBeginOffset || 2635 EndOffset < NewAllocaEndOffset)) { 2636 if (V->getType() != ScalarTy) 2637 V = convertValue(TD, IRB, V, ScalarTy); 2638 StoreInst *Store = IRB.CreateAlignedStore( 2639 IRB.CreateInsertElement(IRB.CreateAlignedLoad(&NewAI, 2640 NewAI.getAlignment(), 2641 getName(".load")), 2642 V, getIndex(IRB, BeginOffset), 2643 getName(".insert")), 2644 &NewAI, NewAI.getAlignment()); 2645 (void)Store; 2646 DEBUG(dbgs() << " to: " << *Store << "\n"); 2647 return true; 2648 } 2649 2650 // If this is a memset on an alloca where we can widen stores, insert the 2651 // set integer. 2652 if (IntTy && (BeginOffset > NewAllocaBeginOffset || 2653 EndOffset < NewAllocaEndOffset)) { 2654 assert(!II.isVolatile()); 2655 StoreInst *Store = insertInteger(IRB, V, BeginOffset); 2656 (void)Store; 2657 DEBUG(dbgs() << " to: " << *Store << "\n"); 2658 return true; 2659 } 2660 2661 if (V->getType() != AllocaTy) 2662 V = convertValue(TD, IRB, V, AllocaTy); 2663 2664 Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), 2665 II.isVolatile()); 2666 (void)New; 2667 DEBUG(dbgs() << " to: " << *New << "\n"); 2668 return !II.isVolatile(); 2669 } 2670 2671 bool visitMemTransferInst(MemTransferInst &II) { 2672 // Rewriting of memory transfer instructions can be a bit tricky. We break 2673 // them into two categories: split intrinsics and unsplit intrinsics. 2674 2675 DEBUG(dbgs() << " original: " << II << "\n"); 2676 IRBuilder<> IRB(&II); 2677 2678 assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr); 2679 bool IsDest = II.getRawDest() == OldPtr; 2680 2681 const AllocaPartitioning::MemTransferOffsets &MTO 2682 = P.getMemTransferOffsets(II); 2683 2684 assert(OldPtr->getType()->isPointerTy() && "Must be a pointer type!"); 2685 unsigned AS = cast<PointerType>(OldPtr->getType())->getAddressSpace(); 2686 // Compute the relative offset within the transfer. 2687 unsigned IntPtrWidth = TD.getPointerSizeInBits(AS); 2688 APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin 2689 : MTO.SourceBegin)); 2690 2691 unsigned Align = II.getAlignment(); 2692 if (Align > 1) 2693 Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), 2694 MinAlign(II.getAlignment(), getPartitionAlign())); 2695 2696 // For unsplit intrinsics, we simply modify the source and destination 2697 // pointers in place. This isn't just an optimization, it is a matter of 2698 // correctness. With unsplit intrinsics we may be dealing with transfers 2699 // within a single alloca before SROA ran, or with transfers that have 2700 // a variable length. We may also be dealing with memmove instead of 2701 // memcpy, and so simply updating the pointers is the necessary for us to 2702 // update both source and dest of a single call. 2703 if (!MTO.IsSplittable) { 2704 Value *OldOp = IsDest ? 
II.getRawDest() : II.getRawSource(); 2705 if (IsDest) 2706 II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); 2707 else 2708 II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType())); 2709 2710 Type *CstTy = II.getAlignmentCst()->getType(); 2711 II.setAlignment(ConstantInt::get(CstTy, Align)); 2712 2713 DEBUG(dbgs() << " to: " << II << "\n"); 2714 deleteIfTriviallyDead(OldOp); 2715 return false; 2716 } 2717 // For split transfer intrinsics we have an incredibly useful assurance: 2718 // the source and destination do not reside within the same alloca, and at 2719 // least one of them does not escape. This means that we can replace 2720 // memmove with memcpy, and we don't need to worry about all manner of 2721 // downsides to splitting and transforming the operations. 2722 2723 // If this doesn't map cleanly onto the alloca type, and that type isn't 2724 // a single value type, just emit a memcpy. 2725 bool EmitMemCpy 2726 = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset || 2727 EndOffset != NewAllocaEndOffset || 2728 !NewAI.getAllocatedType()->isSingleValueType()); 2729 2730 // If we're just going to emit a memcpy, the alloca hasn't changed, and the 2731 // size hasn't been shrunk based on analysis of the viable range, this is 2732 // a no-op. 2733 if (EmitMemCpy && &OldAI == &NewAI) { 2734 uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin; 2735 uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd; 2736 // Ensure the start lines up. 2737 assert(BeginOffset == OrigBegin); 2738 (void)OrigBegin; 2739 2740 // Rewrite the size as needed. 2741 if (EndOffset != OrigEnd) 2742 II.setLength(ConstantInt::get(II.getLength()->getType(), 2743 EndOffset - BeginOffset)); 2744 return false; 2745 } 2746 // Record this instruction for deletion. 2747 if (Pass.DeadSplitInsts.insert(&II)) 2748 Pass.DeadInsts.push_back(&II); 2749 2750 bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && 2751 EndOffset == NewAllocaEndOffset; 2752 bool IsVectorElement = VecTy && !IsWholeAlloca; 2753 uint64_t Size = EndOffset - BeginOffset; 2754 IntegerType *SubIntTy 2755 = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; 2756 2757 Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() 2758 : II.getRawDest()->getType(); 2759 if (!EmitMemCpy) { 2760 if (IsVectorElement) 2761 OtherPtrTy = VecTy->getElementType()->getPointerTo(); 2762 else if (IntTy && !IsWholeAlloca) 2763 OtherPtrTy = SubIntTy->getPointerTo(); 2764 else 2765 OtherPtrTy = NewAI.getType(); 2766 } 2767 2768 // Compute the other pointer, folding as much as possible to produce 2769 // a single, simple GEP in most cases. 2770 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); 2771 OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, 2772 getName("." + OtherPtr->getName())); 2773 2774 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2775 // alloca that should be re-examined after rewriting this instruction. 2776 if (AllocaInst *AI 2777 = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) 2778 Pass.Worklist.insert(AI); 2779 2780 if (EmitMemCpy) { 2781 Value *OurPtr 2782 = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() 2783 : II.getRawSource()->getType()); 2784 Type *SizeTy = II.getLength()->getType(); 2785 Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); 2786 2787 CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr, 2788 IsDest ? 
OtherPtr : OurPtr, 2789 Size, Align, II.isVolatile()); 2790 (void)New; 2791 DEBUG(dbgs() << " to: " << *New << "\n"); 2792 return false; 2793 } 2794 2795 // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy 2796 // is equivalent to 1, but that isn't true if we end up rewriting this as 2797 // a load or store. 2798 if (!Align) 2799 Align = 1; 2800 2801 Value *SrcPtr = OtherPtr; 2802 Value *DstPtr = &NewAI; 2803 if (!IsDest) 2804 std::swap(SrcPtr, DstPtr); 2805 2806 Value *Src; 2807 if (IsVectorElement && !IsDest) { 2808 // We have to extract rather than load. 2809 Src = IRB.CreateExtractElement( 2810 IRB.CreateAlignedLoad(SrcPtr, Align, getName(".copyload")), 2811 getIndex(IRB, BeginOffset), 2812 getName(".copyextract")); 2813 } else if (IntTy && !IsWholeAlloca && !IsDest) { 2814 Src = extractInteger(IRB, SubIntTy, BeginOffset); 2815 } else { 2816 Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), 2817 getName(".copyload")); 2818 } 2819 2820 if (IntTy && !IsWholeAlloca && IsDest) { 2821 StoreInst *Store = insertInteger(IRB, Src, BeginOffset); 2822 (void)Store; 2823 DEBUG(dbgs() << " to: " << *Store << "\n"); 2824 return true; 2825 } 2826 2827 if (IsVectorElement && IsDest) { 2828 // We have to insert into a loaded copy before storing. 2829 Src = IRB.CreateInsertElement( 2830 IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")), 2831 Src, getIndex(IRB, BeginOffset), 2832 getName(".insert")); 2833 } 2834 2835 StoreInst *Store = cast<StoreInst>( 2836 IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile())); 2837 (void)Store; 2838 DEBUG(dbgs() << " to: " << *Store << "\n"); 2839 return !II.isVolatile(); 2840 } 2841 2842 bool visitIntrinsicInst(IntrinsicInst &II) { 2843 assert(II.getIntrinsicID() == Intrinsic::lifetime_start || 2844 II.getIntrinsicID() == Intrinsic::lifetime_end); 2845 DEBUG(dbgs() << " original: " << II << "\n"); 2846 IRBuilder<> IRB(&II); 2847 assert(II.getArgOperand(1) == OldPtr); 2848 2849 // Record this instruction for deletion. 2850 if (Pass.DeadSplitInsts.insert(&II)) 2851 Pass.DeadInsts.push_back(&II); 2852 2853 ConstantInt *Size 2854 = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), 2855 EndOffset - BeginOffset); 2856 Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType()); 2857 Value *New; 2858 if (II.getIntrinsicID() == Intrinsic::lifetime_start) 2859 New = IRB.CreateLifetimeStart(Ptr, Size); 2860 else 2861 New = IRB.CreateLifetimeEnd(Ptr, Size); 2862 2863 DEBUG(dbgs() << " to: " << *New << "\n"); 2864 return true; 2865 } 2866 2867 bool visitPHINode(PHINode &PN) { 2868 DEBUG(dbgs() << " original: " << PN << "\n"); 2869 2870 // We would like to compute a new pointer in only one place, but have it be 2871 // as local as possible to the PHI. To do that, we re-use the location of 2872 // the old pointer, which necessarily must be in the right position to 2873 // dominate the PHI. 2874 IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr)); 2875 2876 Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); 2877 // Replace the operands which were using the old pointer. 
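    // (The old pointer may feed several incoming values of the PHI, so walk
    //  every operand rather than assuming a single match.)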
2878 User::op_iterator OI = PN.op_begin(), OE = PN.op_end(); 2879 for (; OI != OE; ++OI) 2880 if (*OI == OldPtr) 2881 *OI = NewPtr; 2882 2883 DEBUG(dbgs() << " to: " << PN << "\n"); 2884 deleteIfTriviallyDead(OldPtr); 2885 return false; 2886 } 2887 2888 bool visitSelectInst(SelectInst &SI) { 2889 DEBUG(dbgs() << " original: " << SI << "\n"); 2890 IRBuilder<> IRB(&SI); 2891 2892 // Find the operand we need to rewrite here. 2893 bool IsTrueVal = SI.getTrueValue() == OldPtr; 2894 if (IsTrueVal) 2895 assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!"); 2896 else 2897 assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!"); 2898 2899 Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType()); 2900 SI.setOperand(IsTrueVal ? 1 : 2, NewPtr); 2901 DEBUG(dbgs() << " to: " << SI << "\n"); 2902 deleteIfTriviallyDead(OldPtr); 2903 return false; 2904 } 2905 2906}; 2907} 2908 2909namespace { 2910/// \brief Visitor to rewrite aggregate loads and stores as scalar. 2911/// 2912/// This pass aggressively rewrites all aggregate loads and stores on 2913/// a particular pointer (or any pointer derived from it which we can identify) 2914/// with scalar loads and stores. 2915class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { 2916 // Befriend the base class so it can delegate to private visit methods. 2917 friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; 2918 2919 const DataLayout &TD; 2920 2921 /// Queue of pointer uses to analyze and potentially rewrite. 2922 SmallVector<Use *, 8> Queue; 2923 2924 /// Set to prevent us from cycling with phi nodes and loops. 2925 SmallPtrSet<User *, 8> Visited; 2926 2927 /// The current pointer use being rewritten. This is used to dig up the used 2928 /// value (as opposed to the user). 2929 Use *U; 2930 2931public: 2932 AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {} 2933 2934 /// Rewrite loads and stores through a pointer and all pointers derived from 2935 /// it. 2936 bool rewrite(Instruction &I) { 2937 DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); 2938 enqueueUsers(I); 2939 bool Changed = false; 2940 while (!Queue.empty()) { 2941 U = Queue.pop_back_val(); 2942 Changed |= visit(cast<Instruction>(U->getUser())); 2943 } 2944 return Changed; 2945 } 2946 2947private: 2948 /// Enqueue all the users of the given instruction for further processing. 2949 /// This uses a set to de-duplicate users. 2950 void enqueueUsers(Instruction &I) { 2951 for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; 2952 ++UI) 2953 if (Visited.insert(*UI)) 2954 Queue.push_back(&UI.getUse()); 2955 } 2956 2957 // Conservative default is to not rewrite anything. 2958 bool visitInstruction(Instruction &I) { return false; } 2959 2960 /// \brief Generic recursive split emission class. 2961 template <typename Derived> 2962 class OpSplitter { 2963 protected: 2964 /// The builder used to form new instructions. 2965 IRBuilder<> IRB; 2966 /// The indices which to be used with insert- or extractvalue to select the 2967 /// appropriate value within the aggregate. 2968 SmallVector<unsigned, 4> Indices; 2969 /// The indices to a GEP instruction which will move Ptr to the correct slot 2970 /// within the aggregate. 2971 SmallVector<Value *, 4> GEPIndices; 2972 /// The base pointer of the original op, used as a base for GEPing the 2973 /// split operations. 2974 Value *Ptr; 2975 2976 /// Initialize the splitter with an insertion point, Ptr and start with a 2977 /// single zero GEP index. 
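    /// (The leading zero index steps through the pointer itself, mirroring the
    ///  'i32 0' that begins a typical aggregate GEP such as
    ///  'getelementptr inbounds %struct.S* %p, i32 0, i32 1'.)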
    OpSplitter(Instruction *InsertionPoint, Value *Ptr)
      : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}

  public:
    /// \brief Generic recursive split emission routine.
    ///
    /// This method recursively splits an aggregate op (load or store) into
    /// scalar or vector ops. It splits recursively until it hits a single value
    /// and emits that single value operation via the template argument.
    ///
    /// The logic of this routine relies on GEPs and insertvalue and
    /// extractvalue all operating with the same fundamental index list, merely
    /// formatted differently (GEPs need actual values).
    ///
    /// \param Ty The type being split recursively into smaller ops.
    /// \param Agg The aggregate value being built up or stored, depending on
    /// whether this is splitting a load or a store respectively.
    void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
      if (Ty->isSingleValueType())
        return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);

      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
        unsigned OldSize = Indices.size();
        (void)OldSize;
        for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
             ++Idx) {
          assert(Indices.size() == OldSize && "Did not return to the old size");
          Indices.push_back(Idx);
          GEPIndices.push_back(IRB.getInt32(Idx));
          emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
          GEPIndices.pop_back();
          Indices.pop_back();
        }
        return;
      }

      if (StructType *STy = dyn_cast<StructType>(Ty)) {
        unsigned OldSize = Indices.size();
        (void)OldSize;
        for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
             ++Idx) {
          assert(Indices.size() == OldSize && "Did not return to the old size");
          Indices.push_back(Idx);
          GEPIndices.push_back(IRB.getInt32(Idx));
          emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
          GEPIndices.pop_back();
          Indices.pop_back();
        }
        return;
      }

      llvm_unreachable("Only arrays and structs are aggregate loadable types");
    }
  };

  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
      : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}

    /// Emit a leaf load of a single value. This is called at the leaves of the
    /// recursive emission to actually load values.
    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
      assert(Ty->isSingleValueType());
      // Load the single value and insert it using the indices.
      Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices,
                                                         Name + ".gep"),
                                   Name + ".load");
      Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
      DEBUG(dbgs() << " to: " << *Load << "\n");
    }
  };

  bool visitLoadInst(LoadInst &LI) {
    assert(LI.getPointerOperand() == *U);
    if (!LI.isSimple() || LI.getType()->isSingleValueType())
      return false;

    // We have an aggregate being loaded, split it apart.
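    // For example (an illustrative sketch, not generated output), a load of a
    // first-class aggregate such as
    //   %agg = load { i32, float }* %p
    // is replaced by per-element loads through inbounds GEPs which are then
    // reassembled with insertvalue:
    //   %gep0 = getelementptr inbounds { i32, float }* %p, i32 0, i32 0
    //   %v0   = load i32* %gep0
    //   %agg0 = insertvalue { i32, float } undef, i32 %v0, 0
    //   %gep1 = getelementptr inbounds { i32, float }* %p, i32 0, i32 1
    //   %v1   = load float* %gep1
    //   %agg1 = insertvalue { i32, float } %agg0, float %v1, 1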
    DEBUG(dbgs() << " original: " << LI << "\n");
    LoadOpSplitter Splitter(&LI, *U);
    Value *V = UndefValue::get(LI.getType());
    Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
    LI.replaceAllUsesWith(V);
    LI.eraseFromParent();
    return true;
  }

  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
    StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
      : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}

    /// Emit a leaf store of a single value. This is called at the leaves of the
    /// recursive emission to actually produce stores.
    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
      assert(Ty->isSingleValueType());
      // Extract the single value and store it using the indices.
      Value *Store = IRB.CreateStore(
        IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
        IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
      (void)Store;
      DEBUG(dbgs() << " to: " << *Store << "\n");
    }
  };

  bool visitStoreInst(StoreInst &SI) {
    if (!SI.isSimple() || SI.getPointerOperand() != *U)
      return false;
    Value *V = SI.getValueOperand();
    if (V->getType()->isSingleValueType())
      return false;

    // We have an aggregate being stored, split it apart.
    DEBUG(dbgs() << " original: " << SI << "\n");
    StoreOpSplitter Splitter(&SI, *U);
    Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
    SI.eraseFromParent();
    return true;
  }

  bool visitBitCastInst(BitCastInst &BC) {
    enqueueUsers(BC);
    return false;
  }

  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    enqueueUsers(GEPI);
    return false;
  }

  bool visitPHINode(PHINode &PN) {
    enqueueUsers(PN);
    return false;
  }

  bool visitSelectInst(SelectInst &SI) {
    enqueueUsers(SI);
    return false;
  }
};
}

/// \brief Strip aggregate type wrapping.
///
/// This removes no-op aggregate types wrapping an underlying type. It will
/// strip as many layers of types as it can without changing either the type
/// size or the allocated size.
static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
  if (Ty->isSingleValueType())
    return Ty;

  uint64_t AllocSize = DL.getTypeAllocSize(Ty);
  uint64_t TypeSize = DL.getTypeSizeInBits(Ty);

  Type *InnerTy;
  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
    InnerTy = ArrTy->getElementType();
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    const StructLayout *SL = DL.getStructLayout(STy);
    unsigned Index = SL->getElementContainingOffset(0);
    InnerTy = STy->getElementType(Index);
  } else {
    return Ty;
  }

  if (AllocSize > DL.getTypeAllocSize(InnerTy) ||
      TypeSize > DL.getTypeSizeInBits(InnerTy))
    return Ty;

  return stripAggregateTypeWrapping(DL, InnerTy);
}

/// \brief Try to find a partition of the aggregate type passed in for a given
/// offset and size.
///
/// This recurses through the aggregate type and tries to compute a subtype
/// based on the offset and size. When the offset and size span a sub-section
/// of an array, it will even compute a new array type for that sub-section,
/// and the same for structs.
///
/// Note that this routine is very strict and tries to find a partition of the
/// type which produces the *exact* right offset and size. It is not forgiving
/// when the size or offset cause either end of a type-based partition to be
/// off. Also, this is a best-effort routine. It is reasonable to give up and
/// not return a type if necessary.
static Type *getTypePartition(const DataLayout &TD, Type *Ty,
                              uint64_t Offset, uint64_t Size) {
  if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size)
    return stripAggregateTypeWrapping(TD, Ty);

  if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
    // We can't partition pointers...
    if (SeqTy->isPointerTy())
      return 0;

    Type *ElementTy = SeqTy->getElementType();
    uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
    uint64_t NumSkippedElements = Offset / ElementSize;
    if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy))
      if (NumSkippedElements >= ArrTy->getNumElements())
        return 0;
    if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy))
      if (NumSkippedElements >= VecTy->getNumElements())
        return 0;
    Offset -= NumSkippedElements * ElementSize;

    // First check if we need to recurse.
    if (Offset > 0 || Size < ElementSize) {
      // Bail if the partition ends in a different array element.
      if ((Offset + Size) > ElementSize)
        return 0;
      // Recurse through the element type trying to peel off offset bytes.
      return getTypePartition(TD, ElementTy, Offset, Size);
    }
    assert(Offset == 0);

    if (Size == ElementSize)
      return stripAggregateTypeWrapping(TD, ElementTy);
    assert(Size > ElementSize);
    uint64_t NumElements = Size / ElementSize;
    if (NumElements * ElementSize != Size)
      return 0;
    return ArrayType::get(ElementTy, NumElements);
  }

  StructType *STy = dyn_cast<StructType>(Ty);
  if (!STy)
    return 0;

  const StructLayout *SL = TD.getStructLayout(STy);
  if (Offset >= SL->getSizeInBytes())
    return 0;
  uint64_t EndOffset = Offset + Size;
  if (EndOffset > SL->getSizeInBytes())
    return 0;

  unsigned Index = SL->getElementContainingOffset(Offset);
  Offset -= SL->getElementOffset(Index);

  Type *ElementTy = STy->getElementType(Index);
  uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
  if (Offset >= ElementSize)
    return 0; // The offset points into alignment padding.

  // See if any partition must be contained by the element.
  if (Offset > 0 || Size < ElementSize) {
    if ((Offset + Size) > ElementSize)
      return 0;
    return getTypePartition(TD, ElementTy, Offset, Size);
  }
  assert(Offset == 0);

  if (Size == ElementSize)
    return stripAggregateTypeWrapping(TD, ElementTy);

  StructType::element_iterator EI = STy->element_begin() + Index,
                               EE = STy->element_end();
  if (EndOffset < SL->getSizeInBytes()) {
    unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
    if (Index == EndIndex)
      return 0; // Within a single element and its padding.

    // Don't try to form "natural" types if the elements don't line up with the
    // expected size.
    // FIXME: We could potentially recurse down through the last element in the
    // sub-struct to find a natural end point.
    if (SL->getElementOffset(EndIndex) != EndOffset)
      return 0;

    assert(Index < EndIndex);
    EE = STy->element_begin() + EndIndex;
  }

  // Try to build up a sub-structure.
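  // For example (illustrative), for { i32, i32, i32, i32 } with Offset == 4
  // and Size == 8 we reach this point with Index == 1 and EndIndex == 3 and
  // form the sub-struct { i32, i32 }, whose layout size matches Size below.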
  SmallVector<Type *, 4> ElementTys;
  do {
    ElementTys.push_back(*EI++);
  } while (EI != EE);
  StructType *SubTy = StructType::get(STy->getContext(), ElementTys,
                                      STy->isPacked());
  const StructLayout *SubSL = TD.getStructLayout(SubTy);
  if (Size != SubSL->getSizeInBytes())
    return 0; // The sub-struct doesn't have quite the size needed.

  return SubTy;
}

/// \brief Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
/// to rewrite uses of an alloca partition to be conducive for SSA value
/// promotion. If the partition needs a new, more refined alloca, this will
/// build that new alloca, preserving as much type information as possible, and
/// rewrite the uses of the old alloca to point at the new one and have the
/// appropriate new offsets. It also evaluates how successful the rewrite was
/// at enabling promotion and, if it was successful, queues the alloca to be
/// promoted.
bool SROA::rewriteAllocaPartition(AllocaInst &AI,
                                  AllocaPartitioning &P,
                                  AllocaPartitioning::iterator PI) {
  uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset;
  bool IsLive = false;
  for (AllocaPartitioning::use_iterator UI = P.use_begin(PI),
                                        UE = P.use_end(PI);
       UI != UE && !IsLive; ++UI)
    if (UI->U)
      IsLive = true;
  if (!IsLive)
    return false; // No live uses left of this partition.

  DEBUG(dbgs() << "Speculating PHIs and selects in partition "
               << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n");

  PHIOrSelectSpeculator Speculator(*TD, P, *this);
  DEBUG(dbgs() << " speculating ");
  DEBUG(P.print(dbgs(), PI, ""));
  Speculator.visitUsers(PI);

  // Try to compute a friendly type for this partition of the alloca. This
  // won't always succeed, in which case we fall back to a legal integer type
  // or an i8 array of an appropriate size.
  Type *AllocaTy = 0;
  if (Type *PartitionTy = P.getCommonType(PI))
    if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize)
      AllocaTy = PartitionTy;
  if (!AllocaTy)
    if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(),
                                             PI->BeginOffset, AllocaSize))
      AllocaTy = PartitionTy;
  if ((!AllocaTy ||
       (AllocaTy->isArrayTy() &&
        AllocaTy->getArrayElementType()->isIntegerTy())) &&
      TD->isLegalInteger(AllocaSize * 8))
    AllocaTy = Type::getIntNTy(*C, AllocaSize * 8);
  if (!AllocaTy)
    AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize);
  assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize);

  // Check for the case where we're going to rewrite to a new alloca of the
  // exact same type as the original, and with the same access offsets. In that
  // case, re-use the existing alloca, but still run through the rewriter to
  // perform phi and select speculation.
  AllocaInst *NewAI;
  if (AllocaTy == AI.getAllocatedType()) {
    assert(PI->BeginOffset == 0 &&
           "Non-zero begin offset but same alloca type");
    assert(PI == P.begin() && "Begin offset is zero on later partition");
    NewAI = &AI;
  } else {
    unsigned Alignment = AI.getAlignment();
    if (!Alignment) {
      // The minimum alignment which users can rely on when the explicit
      // alignment is omitted or zero is that required by the ABI for this
      // type.
      Alignment = TD->getABITypeAlignment(AI.getAllocatedType());
    }
    Alignment = MinAlign(Alignment, PI->BeginOffset);
    // If we will get at least this much alignment from the type alone, leave
    // the alloca's alignment unconstrained.
    if (Alignment <= TD->getABITypeAlignment(AllocaTy))
      Alignment = 0;
    NewAI = new AllocaInst(AllocaTy, 0, Alignment,
                           AI.getName() + ".sroa." + Twine(PI - P.begin()),
                           &AI);
    ++NumNewAllocas;
  }

  DEBUG(dbgs() << "Rewriting alloca partition "
               << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: "
               << *NewAI << "\n");

  // Track the high watermark of the post-promotion worklist. We will reset it
  // to this point if the alloca is not in fact scheduled for promotion.
  unsigned PPWOldSize = PostPromotionWorklist.size();

  AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI,
                                   PI->BeginOffset, PI->EndOffset);
  DEBUG(dbgs() << " rewriting ");
  DEBUG(P.print(dbgs(), PI, ""));
  bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI));
  if (Promotable) {
    DEBUG(dbgs() << " and queuing for promotion\n");
    PromotableAllocas.push_back(NewAI);
  } else if (NewAI != &AI) {
    // If we can't promote the alloca, iterate on it to check for new
    // refinements exposed by splitting the current alloca. Don't iterate on an
    // alloca which didn't actually change and didn't get promoted.
    Worklist.insert(NewAI);
  }

  // Drop any post-promotion work items if promotion didn't happen.
  if (!Promotable)
    while (PostPromotionWorklist.size() > PPWOldSize)
      PostPromotionWorklist.pop_back();

  return true;
}

/// \brief Walks the partitioning of an alloca, rewriting uses of each
/// partition.
bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) {
  bool Changed = false;
  for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE;
       ++PI)
    Changed |= rewriteAllocaPartition(AI, P, PI);

  return Changed;
}

/// \brief Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// a partitioning of the alloca, and then hands it off to be split and
/// rewritten as needed.
bool SROA::runOnAlloca(AllocaInst &AI) {
  DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
  ++NumAllocasAnalyzed;

  // Special case dead allocas, as they're trivial.
  if (AI.use_empty()) {
    AI.eraseFromParent();
    return true;
  }

  // Skip alloca forms that this analysis can't handle.
  if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
      TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
    return false;

  bool Changed = false;

  // First, split any FCA loads and stores touching this alloca to promote
  // better splitting and promotion opportunities.
  AggLoadStoreRewriter AggRewriter(*TD);
  Changed |= AggRewriter.rewrite(AI);

  // Build the partition set using a recursive instruction-visiting builder.
  AllocaPartitioning P(*TD, AI);
  DEBUG(P.print(dbgs()));
  if (P.isEscaped())
    return Changed;

  // Delete all the dead users of this alloca before splitting and rewriting it.
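  // Dead users have their results replaced with undef and are queued on
  // DeadInsts so that deleteDeadInstructions can erase them (and anything
  // that becomes trivially dead as a result) once rewriting is finished.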
  for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(),
                                              DE = P.dead_user_end();
       DI != DE; ++DI) {
    Changed = true;
    (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
    DeadInsts.push_back(*DI);
  }
  for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(),
                                            DE = P.dead_op_end();
       DO != DE; ++DO) {
    Value *OldV = **DO;
    // Clobber the use with an undef value.
    **DO = UndefValue::get(OldV->getType());
    if (Instruction *OldI = dyn_cast<Instruction>(OldV))
      if (isInstructionTriviallyDead(OldI)) {
        Changed = true;
        DeadInsts.push_back(OldI);
      }
  }

  // No partitions to split. Leave the dead alloca for a later pass to clean up.
  if (P.begin() == P.end())
    return Changed;

  return splitAlloca(AI, P) || Changed;
}

/// \brief Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
/// minimize the problems of invalidated instruction pointers as such pointers
/// are used heavily in the intermediate stages of the algorithm.
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
  DeadSplitInsts.clear();
  while (!DeadInsts.empty()) {
    Instruction *I = DeadInsts.pop_back_val();
    DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");

    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
        // Zero out the operand and see if it becomes trivially dead.
        *OI = 0;
        if (isInstructionTriviallyDead(U))
          DeadInsts.push_back(U);
      }

    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
      DeletedAllocas.insert(AI);

    ++NumDeleted;
    I->eraseFromParent();
  }
}

/// \brief Promote the allocas, using the best available technique.
///
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
/// If there is a domtree available, we attempt to promote using the full power
/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is
/// based on the SSAUpdater utilities. This function returns whether any
/// promotion occurred.
bool SROA::promoteAllocas(Function &F) {
  if (PromotableAllocas.empty())
    return false;

  NumPromoted += PromotableAllocas.size();

  if (DT && !ForceSSAUpdater) {
    DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
    PromoteMemToReg(PromotableAllocas, *DT);
    PromotableAllocas.clear();
    return true;
  }

  DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
  SSAUpdater SSA;
  DIBuilder DIB(*F.getParent());
  SmallVector<Instruction*, 64> Insts;

  for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
    AllocaInst *AI = PromotableAllocas[Idx];
    for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
         UI != UE;) {
      Instruction *I = cast<Instruction>(*UI++);
      // FIXME: Currently the SSAUpdater infrastructure doesn't reason about
      // lifetime intrinsics and so we strip them (and the bitcasts+GEPs
      // leading to them) here. Eventually it should use them to optimize the
      // scalar values produced.
      if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
        assert(onlyUsedByLifetimeMarkers(I) &&
               "Found a bitcast used outside of a lifetime marker.");
        while (!I->use_empty())
          cast<Instruction>(*I->use_begin())->eraseFromParent();
        I->eraseFromParent();
        continue;
      }
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
        assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
               II->getIntrinsicID() == Intrinsic::lifetime_end);
        II->eraseFromParent();
        continue;
      }

      Insts.push_back(I);
    }
    AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
    Insts.clear();
  }

  PromotableAllocas.clear();
  return true;
}

namespace {
  /// \brief A predicate to test whether an alloca belongs to a set.
  class IsAllocaInSet {
    typedef SmallPtrSet<AllocaInst *, 4> SetType;
    const SetType &Set;

  public:
    typedef AllocaInst *argument_type;

    IsAllocaInSet(const SetType &Set) : Set(Set) {}
    bool operator()(AllocaInst *AI) const { return Set.count(AI); }
  };
}

bool SROA::runOnFunction(Function &F) {
  DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
  C = &F.getContext();
  TD = getAnalysisIfAvailable<DataLayout>();
  if (!TD) {
    DEBUG(dbgs() << " Skipping SROA -- no target data!\n");
    return false;
  }
  DT = getAnalysisIfAvailable<DominatorTree>();

  BasicBlock &EntryBB = F.getEntryBlock();
  for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end());
       I != E; ++I)
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
      Worklist.insert(AI);

  bool Changed = false;
  // A set of deleted alloca instruction pointers which should be removed from
  // the list of promotable allocas.
  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;

  do {
    while (!Worklist.empty()) {
      Changed |= runOnAlloca(*Worklist.pop_back_val());
      deleteDeadInstructions(DeletedAllocas);

      // Remove the deleted allocas from various lists so that we don't try to
      // continue processing them.
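      // Note that Worklist and PostPromotionWorklist expose their own
      // remove_if, while PromotableAllocas is a plain vector and so uses the
      // usual erase(std::remove_if(...)) idiom below.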
      if (!DeletedAllocas.empty()) {
        Worklist.remove_if(IsAllocaInSet(DeletedAllocas));
        PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas));
        PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
                                               PromotableAllocas.end(),
                                               IsAllocaInSet(DeletedAllocas)),
                                PromotableAllocas.end());
        DeletedAllocas.clear();
      }
    }

    Changed |= promoteAllocas(F);

    Worklist = PostPromotionWorklist;
    PostPromotionWorklist.clear();
  } while (!Worklist.empty());

  return Changed;
}

void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
  if (RequiresDomTree)
    AU.addRequired<DominatorTree>();
  AU.setPreservesCFG();
}