SROA.cpp revision b8bce928f4ffdf50eff69334f3e25b27848536b6
//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This transformation implements the well known scalar replacement of
/// aggregates transformation. It tries to identify promotable elements of an
/// aggregate alloca, and promote them to registers. It will also try to
/// convert uses of an element (or set of elements) of an alloca into a vector
/// or bitfield-style integer scalar if appropriate.
///
/// It works to do this with minimal slicing of the alloca so that regions
/// which are merely transferred in and out of external memory remain unchanged
/// and are not decomposed to scalar code.
///
/// Because this also performs alloca promotion, it can be thought of as also
/// serving the purpose of SSA formation. The algorithm iterates on the
/// function until all opportunities for promotion have been realized.
///
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "sroa"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Constants.h"
#include "llvm/DIBuilder.h"
#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Operator.h"
#include "llvm/Pass.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/InstVisitor.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;

STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
STATISTIC(NumDeleted, "Number of instructions deleted");
STATISTIC(NumVectorized, "Number of vectorized aggregates");

/// Hidden option to force the pass to not use DomTree and mem2reg, instead
/// forming SSA values through the SSAUpdater infrastructure.
static cl::opt<bool>
ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden);

namespace {
/// \brief Alloca partitioning representation.
///
/// This class represents a partitioning of an alloca into slices, and
/// information about the nature of uses of each slice of the alloca. The goal
/// is that this information is sufficient to decide if and how to split the
/// alloca apart and replace slices with scalars. It is also intended that this
/// structure can capture the relevant information needed both to decide about
/// and to enact these transformations.
class AllocaPartitioning {
public:
  /// \brief A common base class for representing a half-open byte range.
  struct ByteRange {
    /// \brief The beginning offset of the range.
    uint64_t BeginOffset;

    /// \brief The ending offset, not included in the range.
    uint64_t EndOffset;

    ByteRange() : BeginOffset(), EndOffset() {}
    ByteRange(uint64_t BeginOffset, uint64_t EndOffset)
        : BeginOffset(BeginOffset), EndOffset(EndOffset) {}

    /// \brief Support for ordering ranges.
    ///
    /// This provides an ordering over ranges such that start offsets are
    /// always increasing, and within equal start offsets, the end offsets are
    /// decreasing. Thus the spanning range comes first in a cluster with the
    /// same start position.
    bool operator<(const ByteRange &RHS) const {
      if (BeginOffset < RHS.BeginOffset) return true;
      if (BeginOffset > RHS.BeginOffset) return false;
      if (EndOffset > RHS.EndOffset) return true;
      return false;
    }

    /// \brief Support comparison with a single offset to allow binary searches.
    friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) {
      return LHS.BeginOffset < RHSOffset;
    }

    friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
                                                const ByteRange &RHS) {
      return LHSOffset < RHS.BeginOffset;
    }

    bool operator==(const ByteRange &RHS) const {
      return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset;
    }
    bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); }
  };

  /// \brief A partition of an alloca.
  ///
  /// This structure represents a contiguous partition of the alloca. These are
  /// formed by examining the uses of the alloca. During formation, they may
  /// overlap but once an AllocaPartitioning is built, the Partitions within it
  /// are all disjoint.
  struct Partition : public ByteRange {
    /// \brief Whether this partition is splittable into smaller partitions.
    ///
    /// We flag partitions as splittable when they are formed entirely due to
    /// accesses by trivially splittable operations such as memset and memcpy.
    bool IsSplittable;

    /// \brief Test whether a partition has been marked as dead.
    bool isDead() const {
      if (BeginOffset == UINT64_MAX) {
        assert(EndOffset == UINT64_MAX);
        return true;
      }
      return false;
    }

    /// \brief Kill a partition.
    /// This is accomplished by setting both its beginning and end offset to
    /// the maximum possible value.
    void kill() {
      assert(!isDead() && "He's Dead, Jim!");
      BeginOffset = EndOffset = UINT64_MAX;
    }

    Partition() : ByteRange(), IsSplittable() {}
    Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable)
        : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {}
  };

  /// \brief A particular use of a partition of the alloca.
  ///
  /// This structure is used to associate uses of a partition with it. They
  /// mark the range of bytes which are referenced by a particular instruction,
  /// and include a handle to the user itself and the pointer value in use.
  /// The bounds of these uses are determined by intersecting the bounds of the
  /// memory use itself with a particular partition. As a consequence there is
  /// intentional overlap between various uses of the same partition.
  struct PartitionUse : public ByteRange {
    /// \brief The use in question. Provides access to both user and used value.
    ///
    /// Note that this may be null if the partition use is *dead*, that is, it
    /// should be ignored.
    Use *U;

    PartitionUse() : ByteRange(), U() {}
    PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U)
        : ByteRange(BeginOffset, EndOffset), U(U) {}
  };

  /// \brief Construct a partitioning of a particular alloca.
  ///
  /// Construction does most of the work for partitioning the alloca. This
  /// performs the necessary walks of users and builds a partitioning from it.
  AllocaPartitioning(const DataLayout &TD, AllocaInst &AI);

  /// \brief Test whether a pointer to the allocation escapes our analysis.
  ///
  /// If this is true, the partitioning is never fully built and should be
  /// ignored.
  bool isEscaped() const { return PointerEscapingInstr; }

  /// \brief Support for iterating over the partitions.
  /// @{
  typedef SmallVectorImpl<Partition>::iterator iterator;
  iterator begin() { return Partitions.begin(); }
  iterator end() { return Partitions.end(); }

  typedef SmallVectorImpl<Partition>::const_iterator const_iterator;
  const_iterator begin() const { return Partitions.begin(); }
  const_iterator end() const { return Partitions.end(); }
  /// @}

  /// \brief Support for iterating over and manipulating a particular
  /// partition's uses.
  ///
  /// The iteration support provided for uses is more limited, but also
  /// includes some manipulation routines to support rewriting the uses of
  /// partitions during SROA.
  /// @{
  typedef SmallVectorImpl<PartitionUse>::iterator use_iterator;
  use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); }
  use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); }
  use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); }
  use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); }

  typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator;
  const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); }
  const_use_iterator use_begin(const_iterator I) const {
    return Uses[I - begin()].begin();
  }
  const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); }
  const_use_iterator use_end(const_iterator I) const {
    return Uses[I - begin()].end();
  }

  unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); }
  unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); }
  const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const {
    return Uses[PIdx][UIdx];
  }
  const PartitionUse &getUse(const_iterator I, unsigned UIdx) const {
    return Uses[I - begin()][UIdx];
  }

  void use_push_back(unsigned Idx, const PartitionUse &PU) {
    Uses[Idx].push_back(PU);
  }
  void use_push_back(const_iterator I, const PartitionUse &PU) {
    Uses[I - begin()].push_back(PU);
  }
  /// @}

  /// \brief Allow iterating the dead users for this alloca.
  ///
  /// These are instructions which will never actually use the alloca as they
  /// are outside the allocated range. They are safe to replace with undef and
  /// delete.
  /// @{
  typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator;
  dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); }
  dead_user_iterator dead_user_end() const { return DeadUsers.end(); }
  /// @}

  /// \brief Allow iterating the dead expressions referring to this alloca.
  ///
  /// These are operands which cannot actually be used to refer to the
  /// alloca as they are outside its range and the user doesn't correct for
  /// that. These mostly consist of PHI node inputs and the like which we just
  /// need to replace with undef.
  /// @{
  typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator;
  dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); }
  dead_op_iterator dead_op_end() const { return DeadOperands.end(); }
  /// @}

  /// \brief MemTransferInst auxiliary data.
  /// This struct provides some auxiliary data about memory transfer
  /// intrinsics such as memcpy and memmove. These intrinsics can use two
  /// different ranges within the same alloca, and provide other challenges to
  /// correctly represent. We stash extra data to help us untangle this
  /// after the partitioning is complete.
  struct MemTransferOffsets {
    /// The destination begin and end offsets when the destination is within
    /// this alloca. If the end offset is zero the destination is not within
    /// this alloca.
    uint64_t DestBegin, DestEnd;

    /// The source begin and end offsets when the source is within this alloca.
    /// If the end offset is zero, the source is not within this alloca.
    uint64_t SourceBegin, SourceEnd;

    /// Flag for whether an alloca is splittable.
    bool IsSplittable;
  };
  MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const {
    return MemTransferInstData.lookup(&II);
  }

  /// \brief Map from a PHI or select operand back to a partition.
  ///
  /// When manipulating PHI nodes or selects, they can use more than one
  /// partition of an alloca. We store a special mapping to allow finding the
  /// partition referenced by each of these operands, if any.
  iterator findPartitionForPHIOrSelectOperand(Use *U) {
    SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt
      = PHIOrSelectOpMap.find(U);
    if (MapIt == PHIOrSelectOpMap.end())
      return end();

    return begin() + MapIt->second.first;
  }

  /// \brief Map from a PHI or select operand back to the specific use of
  /// a partition.
  ///
  /// Similar to mapping these operands back to the partitions, this maps
  /// directly to the use structure of that partition.
  use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) {
    SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt
      = PHIOrSelectOpMap.find(U);
    assert(MapIt != PHIOrSelectOpMap.end());
    return Uses[MapIt->second.first].begin() + MapIt->second.second;
  }

  /// \brief Compute a common type among the uses of a particular partition.
  ///
  /// This routine walks all of the uses of a particular partition and tries
  /// to find a common type between them. Untyped operations such as memset and
  /// memcpy are ignored.
  Type *getCommonType(iterator I) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
  void printUsers(raw_ostream &OS, const_iterator I,
                  StringRef Indent = " ") const;
  void print(raw_ostream &OS) const;
  void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const;
  void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const;
#endif

private:
  template <typename DerivedT, typename RetT = void> class BuilderBase;
  class PartitionBuilder;
  friend class AllocaPartitioning::PartitionBuilder;
  class UseBuilder;
  friend class AllocaPartitioning::UseBuilder;

#ifndef NDEBUG
  /// \brief Handle to alloca instruction to simplify method interfaces.
  AllocaInst &AI;
#endif

  /// \brief The instruction responsible for this alloca having no partitioning.
  ///
  /// When an instruction (potentially) escapes the pointer to the alloca, we
  /// store a pointer to that here and abort trying to partition the alloca.
  /// This will be null if the alloca is partitioned successfully.
  Instruction *PointerEscapingInstr;

  /// \brief The partitions of the alloca.
  ///
  /// We store a vector of the partitions over the alloca here. This vector is
  /// sorted by increasing begin offset, and then by decreasing end offset. See
  /// the Partition inner class for more details. Initially (during
  /// construction) there are overlaps, but we form a disjoint sequence of
  /// partitions while finishing construction and a fully constructed object is
  /// expected to always have this as a disjoint space.
  SmallVector<Partition, 8> Partitions;

  /// \brief The uses of the partitions.
  ///
  /// This is essentially a mapping from each partition to a list of uses of
  /// that partition. The mapping is done with a Uses vector that has the exact
  /// same number of entries as the partition vector. Each entry is itself
  /// a vector of the uses.
  SmallVector<SmallVector<PartitionUse, 2>, 8> Uses;

  /// \brief Instructions which will become dead if we rewrite the alloca.
  ///
  /// Note that these are not separated by partition. This is because we expect
  /// a partitioned alloca to be completely rewritten or not rewritten at all.
  /// If rewritten, all these instructions can simply be removed and replaced
  /// with undef as they come from outside of the allocated space.
  SmallVector<Instruction *, 8> DeadUsers;

  /// \brief Operands which will become dead if we rewrite the alloca.
  ///
  /// These are operands that in their particular use can be replaced with
  /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
  /// to PHI nodes and the like. They aren't entirely dead (there might be
  /// a GEP back into the bounds using it elsewhere) and neither is the PHI, but
  /// we want to swap this particular input for undef to simplify the use lists
  /// of the alloca.
  SmallVector<Use *, 8> DeadOperands;

  /// \brief The underlying storage for auxiliary memcpy and memset info.
  SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData;

  /// \brief A side data structure used when building up the partitions and uses.
  ///
  /// This mapping is only really used during the initial building of the
  /// partitioning so that we can retain information about PHI and select nodes
  /// processed.
  SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes;

  /// \brief Auxiliary information for particular PHI or select operands.
  SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap;

  /// \brief A utility routine called from the constructor.
  ///
  /// This does what it says on the tin. It is the key to the alloca partition
  /// splitting and merging. After it is called we have the desired disjoint
  /// collection of partitions.
  void splitAndMergePartitions();
};
}

template <typename DerivedT, typename RetT>
class AllocaPartitioning::BuilderBase
    : public InstVisitor<DerivedT, RetT> {
public:
  BuilderBase(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
      : TD(TD),
        AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())),
        P(P) {
    enqueueUsers(AI, 0);
  }

protected:
  const DataLayout &TD;
  const uint64_t AllocSize;
  AllocaPartitioning &P;

  SmallPtrSet<Use *, 8> VisitedUses;

  struct OffsetUse {
    Use *U;
    int64_t Offset;
  };
  SmallVector<OffsetUse, 8> Queue;

  // The active offset and use while visiting.
  Use *U;
  int64_t Offset;

  void enqueueUsers(Instruction &I, int64_t UserOffset) {
    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
         UI != UE; ++UI) {
      if (VisitedUses.insert(&UI.getUse())) {
        OffsetUse OU = { &UI.getUse(), UserOffset };
        Queue.push_back(OU);
      }
    }
  }

  bool computeConstantGEPOffset(GetElementPtrInst &GEPI, int64_t &GEPOffset) {
    GEPOffset = Offset;
    unsigned int AS = GEPI.getPointerAddressSpace();
    for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI);
         GTI != GTE; ++GTI) {
      ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
      if (!OpC)
        return false;
      if (OpC->isZero())
        continue;

      // Handle a struct index, which adds its field offset to the pointer.
      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
        unsigned ElementIdx = OpC->getZExtValue();
        const StructLayout *SL = TD.getStructLayout(STy);
        uint64_t ElementOffset = SL->getElementOffset(ElementIdx);
        // Check that we can continue to model this GEP in a signed 64-bit offset.
        if (ElementOffset > INT64_MAX ||
            (GEPOffset >= 0 &&
             ((uint64_t)GEPOffset + ElementOffset) > INT64_MAX)) {
          DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding "
                       << "what can be represented in an int64_t!\n"
                       << " alloca: " << P.AI << "\n");
          return false;
        }
        if (GEPOffset < 0)
          GEPOffset = ElementOffset + (uint64_t)-GEPOffset;
        else
          GEPOffset += ElementOffset;
        continue;
      }

      APInt Index = OpC->getValue().sextOrTrunc(TD.getPointerSizeInBits(AS));
      Index *= APInt(Index.getBitWidth(),
                     TD.getTypeAllocSize(GTI.getIndexedType()));
      Index += APInt(Index.getBitWidth(), (uint64_t)GEPOffset,
                     /*isSigned*/true);
      // Check if the result can be stored in our int64_t offset.
      if (!Index.isSignedIntN(sizeof(GEPOffset) * 8)) {
        DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding "
                     << "what can be represented in an int64_t!\n"
                     << " alloca: " << P.AI << "\n");
        return false;
      }

      GEPOffset = Index.getSExtValue();
    }
    return true;
  }

  Value *foldSelectInst(SelectInst &SI) {
    // If the condition being selected on is a constant or the same value is
    // being selected between, fold the select. Yes this does (rarely) happen
    // early on.
    if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
      return SI.getOperand(1+CI->isZero());
    if (SI.getOperand(1) == SI.getOperand(2)) {
      assert(*U == SI.getOperand(1));
      return SI.getOperand(1);
    }
    return 0;
  }
};

/// \brief Builder for the alloca partitioning.
///
/// This class builds an alloca partitioning by recursively visiting the uses
/// of an alloca and splitting the partitions for each load and store at each
/// offset.
class AllocaPartitioning::PartitionBuilder
    : public BuilderBase<PartitionBuilder, bool> {
  friend class InstVisitor<PartitionBuilder, bool>;

  SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap;

public:
  PartitionBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
      : BuilderBase<PartitionBuilder, bool>(TD, AI, P) {}

  /// \brief Run the builder over the allocation.
  bool operator()() {
    // Note that we have to re-evaluate size on each trip through the loop as
    // the queue grows at the tail.
    for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
      U = Queue[Idx].U;
      Offset = Queue[Idx].Offset;
      if (!visit(cast<Instruction>(U->getUser())))
        return false;
    }
    return true;
  }

private:
  bool markAsEscaping(Instruction &I) {
    P.PointerEscapingInstr = &I;
    return false;
  }

  void insertUse(Instruction &I, int64_t Offset, uint64_t Size,
                 bool IsSplittable = false) {
    // Completely skip uses which have a zero size or don't overlap the
    // allocation.
    if (Size == 0 ||
        (Offset >= 0 && (uint64_t)Offset >= AllocSize) ||
        (Offset < 0 && (uint64_t)-Offset >= Size)) {
      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
                   << " which starts past the end of the " << AllocSize
                   << " byte alloca:\n"
                   << " alloca: " << P.AI << "\n"
                   << " use: " << I << "\n");
      return;
    }

    // Clamp the start to the beginning of the allocation.
    if (Offset < 0) {
      DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
                   << " to start at the beginning of the alloca:\n"
                   << " alloca: " << P.AI << "\n"
                   << " use: " << I << "\n");
      Size -= (uint64_t)-Offset;
      Offset = 0;
    }

    uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;

    // Clamp the end offset to the end of the allocation. Note that this is
    // formulated to handle even the case where "BeginOffset + Size" overflows.
    assert(AllocSize >= BeginOffset); // Established above.
    if (Size > AllocSize - BeginOffset) {
      DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
                   << " to remain within the " << AllocSize << " byte alloca:\n"
                   << " alloca: " << P.AI << "\n"
                   << " use: " << I << "\n");
      EndOffset = AllocSize;
    }

    Partition New(BeginOffset, EndOffset, IsSplittable);
    P.Partitions.push_back(New);
  }

  bool handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) {
    uint64_t Size = TD.getTypeStoreSize(Ty);

    // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
    // ignore it. Note that this is more strict than the generic clamping
    // behavior of insertUse. We also try to handle cases which might run the
    // risk of overflow.
    // FIXME: We should instead consider the pointer to have escaped if this
    // function is being instrumented for addressing bugs or race conditions.
    if (Offset < 0 || (uint64_t)Offset >= AllocSize ||
        Size > (AllocSize - (uint64_t)Offset)) {
      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte "
                   << (isa<LoadInst>(I) ? "load" : "store") << " @" << Offset
                   << " which extends past the end of the " << AllocSize
                   << " byte alloca:\n"
                   << " alloca: " << P.AI << "\n"
                   << " use: " << I << "\n");
      return true;
    }

    insertUse(I, Offset, Size);
    return true;
  }

  bool visitBitCastInst(BitCastInst &BC) {
    enqueueUsers(BC, Offset);
    return true;
  }

  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    int64_t GEPOffset;
    if (!computeConstantGEPOffset(GEPI, GEPOffset))
      return markAsEscaping(GEPI);

    enqueueUsers(GEPI, GEPOffset);
    return true;
  }

  bool visitLoadInst(LoadInst &LI) {
    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
           "All simple FCA loads should have been pre-split");
    return handleLoadOrStore(LI.getType(), LI, Offset);
  }

  bool visitStoreInst(StoreInst &SI) {
    Value *ValOp = SI.getValueOperand();
    if (ValOp == *U)
      return markAsEscaping(SI);

    assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
           "All simple FCA stores should have been pre-split");
    return handleLoadOrStore(ValOp->getType(), SI, Offset);
  }


  bool visitMemSetInst(MemSetInst &II) {
    assert(II.getRawDest() == *U && "Pointer use is not the destination?");
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
    insertUse(II, Offset, Size, Length);
    return true;
  }

  bool visitMemTransferInst(MemTransferInst &II) {
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
    if (!Size)
      // Zero-length mem transfer intrinsics can be ignored entirely.
      return true;

    MemTransferOffsets &Offsets = P.MemTransferInstData[&II];

    // Only intrinsics with a constant length can be split.
    Offsets.IsSplittable = Length;

    if (*U == II.getRawDest()) {
      Offsets.DestBegin = Offset;
      Offsets.DestEnd = Offset + Size;
    }
    if (*U == II.getRawSource()) {
      Offsets.SourceBegin = Offset;
      Offsets.SourceEnd = Offset + Size;
    }

    // If we have set up end offsets for both the source and the destination,
    // we have found both sides of this transfer pointing at the same alloca.
    bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd;
    if (SeenBothEnds && II.getRawDest() != II.getRawSource()) {
      unsigned PrevIdx = MemTransferPartitionMap[&II];

      // Check if the begin offsets match and this is a non-volatile transfer.
      // In that case, we can completely elide the transfer.
      if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) {
        P.Partitions[PrevIdx].kill();
        return true;
      }

      // Otherwise we have an offset transfer within the same alloca. We can't
      // split those.
      P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false;
    } else if (SeenBothEnds) {
      // Handle the case where this exact use provides both ends of the
      // operation.
      assert(II.getRawDest() == II.getRawSource());

      // For non-volatile transfers this is a no-op.
      if (!II.isVolatile())
        return true;

      // Otherwise just suppress splitting.
      Offsets.IsSplittable = false;
    }


    // Insert the use now that we've fixed up the splittable nature.
    insertUse(II, Offset, Size, Offsets.IsSplittable);

    // Set up the mapping from intrinsic to partition if we've not seen both
    // ends of this transfer.
    if (!SeenBothEnds) {
      unsigned NewIdx = P.Partitions.size() - 1;
      bool Inserted
        = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second;
      assert(Inserted &&
             "Already have intrinsic in map but haven't seen both ends");
      (void)Inserted;
    }

    return true;
  }

  // Disable SROA for any intrinsics except for lifetime invariants.
  // FIXME: What about debug intrinsics? This matches old behavior, but
  // doesn't make sense.
  bool visitIntrinsicInst(IntrinsicInst &II) {
    if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
        II.getIntrinsicID() == Intrinsic::lifetime_end) {
      ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
      uint64_t Size = std::min(AllocSize - Offset, Length->getLimitedValue());
      insertUse(II, Offset, Size, true);
      return true;
    }

    return markAsEscaping(II);
  }

  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
    // We consider any PHI or select that results in a direct load or store of
    // the same offset to be a viable use for partitioning purposes. These uses
    // are considered unsplittable and the size is the maximum loaded or stored
    // size.
    SmallPtrSet<Instruction *, 4> Visited;
    SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
    Visited.insert(Root);
    Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
    // If there are no loads or stores, the access is dead. We mark that as
    // a size zero access.
    Size = 0;
    do {
      Instruction *I, *UsedI;
      llvm::tie(UsedI, I) = Uses.pop_back_val();

      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
        Size = std::max(Size, TD.getTypeStoreSize(LI->getType()));
        continue;
      }
      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
        Value *Op = SI->getOperand(0);
        if (Op == UsedI)
          return SI;
        Size = std::max(Size, TD.getTypeStoreSize(Op->getType()));
        continue;
      }

      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
        if (!GEP->hasAllZeroIndices())
          return GEP;
      } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
                 !isa<SelectInst>(I)) {
        return I;
      }

      for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
           ++UI)
        if (Visited.insert(cast<Instruction>(*UI)))
          Uses.push_back(std::make_pair(I, cast<Instruction>(*UI)));
    } while (!Uses.empty());

    return 0;
  }

  bool visitPHINode(PHINode &PN) {
    // See if we already have computed info on this node.
    std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN];
    if (PHIInfo.first) {
      PHIInfo.second = true;
      insertUse(PN, Offset, PHIInfo.first);
      return true;
    }

    // Check for an unsafe use of the PHI node.
    if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first))
      return markAsEscaping(*EscapingI);

    insertUse(PN, Offset, PHIInfo.first);
    return true;
  }

  bool visitSelectInst(SelectInst &SI) {
    if (Value *Result = foldSelectInst(SI)) {
      if (Result == *U)
        // If the result of the constant fold will be the pointer, recurse
        // through the select as if we had RAUW'ed it.
        enqueueUsers(SI, Offset);

      return true;
    }

    // See if we already have computed info on this node.
    std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI];
    if (SelectInfo.first) {
      SelectInfo.second = true;
      insertUse(SI, Offset, SelectInfo.first);
      return true;
    }

    // Check for an unsafe use of the select.
    if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first))
      return markAsEscaping(*EscapingI);

    insertUse(SI, Offset, SelectInfo.first);
    return true;
  }

  /// \brief Disable SROA entirely if there are unhandled users of the alloca.
  bool visitInstruction(Instruction &I) { return markAsEscaping(I); }
};


/// \brief Use adder for the alloca partitioning.
///
/// This class adds the uses of an alloca to all of the partitions which they
/// use. For splittable partitions, this can end up doing essentially a linear
/// walk of the partitions, but the number of steps remains bounded by the
/// total result instruction size:
/// - The number of partitions is a result of the number of unsplittable
///   instructions using the alloca.
/// - The number of users of each partition is at worst the total number of
///   splittable instructions using the alloca.
/// Thus we will produce N * M instructions in the end, where N is the number
/// of unsplittable uses and M is the number of splittable ones. This visitor
/// does the exact same number of updates to the partitioning.
///
/// In the more common case, this visitor will leverage the fact that the
/// partition space is pre-sorted, and do a logarithmic search for the
/// partition needed, making the total visit a classical ((N + M) * log(N))
/// complexity operation.
class AllocaPartitioning::UseBuilder : public BuilderBase<UseBuilder> {
  friend class InstVisitor<UseBuilder>;

  /// \brief Set to de-duplicate dead instructions found in the use walk.
  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;

public:
  UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
      : BuilderBase<UseBuilder>(TD, AI, P) {}

  /// \brief Run the builder over the allocation.
  void operator()() {
    // Note that we have to re-evaluate size on each trip through the loop as
    // the queue grows at the tail.
    for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
      U = Queue[Idx].U;
      Offset = Queue[Idx].Offset;
      this->visit(cast<Instruction>(U->getUser()));
    }
  }

private:
  void markAsDead(Instruction &I) {
    if (VisitedDeadInsts.insert(&I))
      P.DeadUsers.push_back(&I);
  }

  void insertUse(Instruction &User, int64_t Offset, uint64_t Size) {
    // If the use has a zero size or extends outside of the allocation, record
    // it as a dead use for elimination later.
    if (Size == 0 || (uint64_t)Offset >= AllocSize ||
        (Offset < 0 && (uint64_t)-Offset >= Size))
      return markAsDead(User);

    // Clamp the start to the beginning of the allocation.
    if (Offset < 0) {
      Size -= (uint64_t)-Offset;
      Offset = 0;
    }

    uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;

    // Clamp the end offset to the end of the allocation. Note that this is
    // formulated to handle even the case where "BeginOffset + Size" overflows.
    assert(AllocSize >= BeginOffset); // Established above.
    if (Size > AllocSize - BeginOffset)
      EndOffset = AllocSize;

    // NB: This only works if we have zero overlapping partitions.
    iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset);
    if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset)
      B = llvm::prior(B);
    for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset;
         ++I) {
      PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset),
                         std::min(I->EndOffset, EndOffset), U);
      P.use_push_back(I, NewPU);
      if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser()))
        P.PHIOrSelectOpMap[U]
          = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1);
    }
  }

  void handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) {
    uint64_t Size = TD.getTypeStoreSize(Ty);

    // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
    // ignore it. Note that this is more strict than the generic clamping
    // behavior of insertUse.
    if (Offset < 0 || (uint64_t)Offset >= AllocSize ||
        Size > (AllocSize - (uint64_t)Offset))
      return markAsDead(I);

    insertUse(I, Offset, Size);
  }

  void visitBitCastInst(BitCastInst &BC) {
    if (BC.use_empty())
      return markAsDead(BC);

    enqueueUsers(BC, Offset);
  }

  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    if (GEPI.use_empty())
      return markAsDead(GEPI);

    int64_t GEPOffset;
    if (!computeConstantGEPOffset(GEPI, GEPOffset))
      llvm_unreachable("Unable to compute constant offset for use");

    enqueueUsers(GEPI, GEPOffset);
  }

  void visitLoadInst(LoadInst &LI) {
    handleLoadOrStore(LI.getType(), LI, Offset);
  }

  void visitStoreInst(StoreInst &SI) {
    handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset);
  }

  void visitMemSetInst(MemSetInst &II) {
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
    insertUse(II, Offset, Size);
  }

  void visitMemTransferInst(MemTransferInst &II) {
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
    if (!Size)
      return markAsDead(II);

    MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
    if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd &&
        Offsets.DestBegin == Offsets.SourceBegin)
      return markAsDead(II); // Skip identity transfers without side-effects.

    insertUse(II, Offset, Size);
  }

  void visitIntrinsicInst(IntrinsicInst &II) {
    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
           II.getIntrinsicID() == Intrinsic::lifetime_end);

    ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
    insertUse(II, Offset,
              std::min(AllocSize - Offset, Length->getLimitedValue()));
  }

  void insertPHIOrSelect(Instruction &User, uint64_t Offset) {
    uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first;

    // For PHI and select operands outside the alloca, we can't nuke the entire
    // phi or select -- the other side might still be relevant, so we special
    // case them here and use a separate structure to track the operands
    // themselves which should be replaced with undef.
    if (Offset >= AllocSize) {
      P.DeadOperands.push_back(U);
      return;
    }

    insertUse(User, Offset, Size);
  }
  void visitPHINode(PHINode &PN) {
    if (PN.use_empty())
      return markAsDead(PN);

    insertPHIOrSelect(PN, Offset);
  }
  void visitSelectInst(SelectInst &SI) {
    if (SI.use_empty())
      return markAsDead(SI);

    if (Value *Result = foldSelectInst(SI)) {
      if (Result == *U)
        // If the result of the constant fold will be the pointer, recurse
        // through the select as if we had RAUW'ed it.
        enqueueUsers(SI, Offset);
      else
        // Otherwise the operand to the select is dead, and we can replace it
        // with undef.
        P.DeadOperands.push_back(U);

      return;
    }

    insertPHIOrSelect(SI, Offset);
  }

  /// \brief Unreachable, we've already visited the alloca once.
  void visitInstruction(Instruction &I) {
    llvm_unreachable("Unhandled instruction in use builder.");
  }
};

void AllocaPartitioning::splitAndMergePartitions() {
  size_t NumDeadPartitions = 0;

  // Track the range of splittable partitions that we pass when accumulating
  // overlapping unsplittable partitions.
  uint64_t SplitEndOffset = 0ull;

  Partition New(0ull, 0ull, false);

  for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) {
    ++j;

    if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) {
      assert(New.BeginOffset == New.EndOffset);
      New = Partitions[i];
    } else {
      assert(New.IsSplittable);
      New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset);
    }
    assert(New.BeginOffset != New.EndOffset);

    // Scan the overlapping partitions.
    while (j != e && New.EndOffset > Partitions[j].BeginOffset) {
      // If the new partition we are forming is splittable, stop at the first
      // unsplittable partition.
      if (New.IsSplittable && !Partitions[j].IsSplittable)
        break;

      // Grow the new partition to include any equally splittable range. 'j' is
      // always equally splittable when New is splittable, but when New is not
      // splittable, we may subsume some (or part of some) splittable partition
      // without growing the new one.
      if (New.IsSplittable == Partitions[j].IsSplittable) {
        New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset);
      } else {
        assert(!New.IsSplittable);
        assert(Partitions[j].IsSplittable);
        SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset);
      }

      Partitions[j].kill();
      ++NumDeadPartitions;
      ++j;
    }

    // If the new partition is splittable, chop off the end as soon as the
    // unsplittable subsequent partition starts and ensure we eventually cover
    // the splittable area.
    if (j != e && New.IsSplittable) {
      SplitEndOffset = std::max(SplitEndOffset, New.EndOffset);
      New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
    }

    // Add the new partition if it differs from the original one and is
    // non-empty. We can end up with an empty partition here if it was
    // splittable but there is an unsplittable one that starts at the same
    // offset.
    if (New != Partitions[i]) {
      if (New.BeginOffset != New.EndOffset)
        Partitions.push_back(New);
      // Mark the old one for removal.
      Partitions[i].kill();
      ++NumDeadPartitions;
    }

    New.BeginOffset = New.EndOffset;
    if (!New.IsSplittable) {
      New.EndOffset = std::max(New.EndOffset, SplitEndOffset);
      if (j != e && !Partitions[j].IsSplittable)
        New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
      New.IsSplittable = true;
      // If there is a trailing splittable partition which won't be fused into
      // the next splittable partition go ahead and add it onto the partitions
      // list.
      if (New.BeginOffset < New.EndOffset &&
          (j == e || !Partitions[j].IsSplittable ||
           New.EndOffset < Partitions[j].BeginOffset)) {
        Partitions.push_back(New);
        New.BeginOffset = New.EndOffset = 0ull;
      }
    }
  }

  // Re-sort the partitions now that they have been split and merged into a
  // disjoint set of partitions. Also remove any of the dead partitions we've
  // replaced in the process.
  std::sort(Partitions.begin(), Partitions.end());
  if (NumDeadPartitions) {
    assert(Partitions.back().isDead());
    assert((ptrdiff_t)NumDeadPartitions ==
           std::count(Partitions.begin(), Partitions.end(), Partitions.back()));
  }
  Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end());
}

AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI)
    :
#ifndef NDEBUG
      AI(AI),
#endif
      PointerEscapingInstr(0) {
  PartitionBuilder PB(TD, AI, *this);
  if (!PB())
    return;

  // Sort the partitions. This arranges for the offsets to be in ascending
  // order, and the sizes to be in descending order.
  std::sort(Partitions.begin(), Partitions.end());

  // Remove any partitions from the back which are marked as dead.
  while (!Partitions.empty() && Partitions.back().isDead())
    Partitions.pop_back();

  if (Partitions.size() > 1) {
    // Intersect splittability for all partitions with equal offsets and sizes.
    // Then remove all but the first so that we have a sequence of non-equal but
    // potentially overlapping partitions.
    for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E;
         I = J) {
      ++J;
      while (J != E && *I == *J) {
        I->IsSplittable &= J->IsSplittable;
        ++J;
      }
    }
    Partitions.erase(std::unique(Partitions.begin(), Partitions.end()),
                     Partitions.end());

    // Split splittable and merge unsplittable partitions into a disjoint set
    // of partitions over the used space of the allocation.
    splitAndMergePartitions();
  }

  // Now build up the user lists for each of these disjoint partitions by
  // re-walking the recursive users of the alloca.
  Uses.resize(Partitions.size());
  UseBuilder UB(TD, AI, *this);
  UB();
}

Type *AllocaPartitioning::getCommonType(iterator I) const {
  Type *Ty = 0;
  for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
    if (!UI->U)
      continue; // Skip dead uses.
    if (isa<IntrinsicInst>(*UI->U->getUser()))
      continue;
    if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset)
      continue;

    Type *UserTy = 0;
    if (LoadInst *LI = dyn_cast<LoadInst>(UI->U->getUser())) {
      UserTy = LI->getType();
    } else if (StoreInst *SI = dyn_cast<StoreInst>(UI->U->getUser())) {
      UserTy = SI->getValueOperand()->getType();
    }

    if (Ty && Ty != UserTy)
      return 0;

    Ty = UserTy;
  }
  return Ty;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

void AllocaPartitioning::print(raw_ostream &OS, const_iterator I,
                               StringRef Indent) const {
  OS << Indent << "partition #" << (I - begin())
     << " [" << I->BeginOffset << "," << I->EndOffset << ")"
     << (I->IsSplittable ? " (splittable)" : "")
     << (Uses[I - begin()].empty() ? " (zero uses)" : "")
     << "\n";
}

void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I,
                                    StringRef Indent) const {
  for (const_use_iterator UI = use_begin(I), UE = use_end(I);
       UI != UE; ++UI) {
    if (!UI->U)
      continue; // Skip dead uses.
    OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") "
       << "used by: " << *UI->U->getUser() << "\n";
    if (MemTransferInst *II = dyn_cast<MemTransferInst>(UI->U->getUser())) {
      const MemTransferOffsets &MTO = MemTransferInstData.lookup(II);
      bool IsDest;
      if (!MTO.IsSplittable)
        IsDest = UI->BeginOffset == MTO.DestBegin;
      else
        IsDest = MTO.DestBegin != 0u;
      OS << Indent << " (original " << (IsDest ? "dest" : "source") << ": "
         << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin)
         << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n";
    }
  }
}

void AllocaPartitioning::print(raw_ostream &OS) const {
  if (PointerEscapingInstr) {
    OS << "No partitioning for alloca: " << AI << "\n"
       << " A pointer to this alloca escaped by:\n"
       << " " << *PointerEscapingInstr << "\n";
    return;
  }

  OS << "Partitioning of alloca: " << AI << "\n";
  unsigned Num = 0;
  for (const_iterator I = begin(), E = end(); I != E; ++I, ++Num) {
    print(OS, I);
    printUsers(OS, I);
  }
}

void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); }
void AllocaPartitioning::dump() const { print(dbgs()); }

#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)


namespace {
/// \brief Implementation of LoadAndStorePromoter for promoting allocas.
1243/// 1244/// This subclass of LoadAndStorePromoter adds overrides to handle promoting 1245/// the loads and stores of an alloca instruction, as well as updating its 1246/// debug information. This is used when a domtree is unavailable and thus 1247/// mem2reg in its full form can't be used to handle promotion of allocas to 1248/// scalar values. 1249class AllocaPromoter : public LoadAndStorePromoter { 1250 AllocaInst &AI; 1251 DIBuilder &DIB; 1252 1253 SmallVector<DbgDeclareInst *, 4> DDIs; 1254 SmallVector<DbgValueInst *, 4> DVIs; 1255 1256public: 1257 AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, 1258 AllocaInst &AI, DIBuilder &DIB) 1259 : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} 1260 1261 void run(const SmallVectorImpl<Instruction*> &Insts) { 1262 // Remember which alloca we're promoting (for isInstInList). 1263 if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { 1264 for (Value::use_iterator UI = DebugNode->use_begin(), 1265 UE = DebugNode->use_end(); 1266 UI != UE; ++UI) 1267 if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) 1268 DDIs.push_back(DDI); 1269 else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) 1270 DVIs.push_back(DVI); 1271 } 1272 1273 LoadAndStorePromoter::run(Insts); 1274 AI.eraseFromParent(); 1275 while (!DDIs.empty()) 1276 DDIs.pop_back_val()->eraseFromParent(); 1277 while (!DVIs.empty()) 1278 DVIs.pop_back_val()->eraseFromParent(); 1279 } 1280 1281 virtual bool isInstInList(Instruction *I, 1282 const SmallVectorImpl<Instruction*> &Insts) const { 1283 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 1284 return LI->getOperand(0) == &AI; 1285 return cast<StoreInst>(I)->getPointerOperand() == &AI; 1286 } 1287 1288 virtual void updateDebugInfo(Instruction *Inst) const { 1289 for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), 1290 E = DDIs.end(); I != E; ++I) { 1291 DbgDeclareInst *DDI = *I; 1292 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) 1293 ConvertDebugDeclareToDebugValue(DDI, SI, DIB); 1294 else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) 1295 ConvertDebugDeclareToDebugValue(DDI, LI, DIB); 1296 } 1297 for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), 1298 E = DVIs.end(); I != E; ++I) { 1299 DbgValueInst *DVI = *I; 1300 Value *Arg = NULL; 1301 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { 1302 // If an argument is zero extended then use argument directly. The ZExt 1303 // may be zapped by an optimization pass in future. 1304 if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) 1305 Arg = dyn_cast<Argument>(ZExt->getOperand(0)); 1306 if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) 1307 Arg = dyn_cast<Argument>(SExt->getOperand(0)); 1308 if (!Arg) 1309 Arg = SI->getOperand(0); 1310 } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { 1311 Arg = LI->getOperand(0); 1312 } else { 1313 continue; 1314 } 1315 Instruction *DbgVal = 1316 DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), 1317 Inst); 1318 DbgVal->setDebugLoc(DVI->getDebugLoc()); 1319 } 1320 } 1321}; 1322} // end anon namespace 1323 1324 1325namespace { 1326/// \brief An optimization pass providing Scalar Replacement of Aggregates. 1327/// 1328/// This pass takes allocations which can be completely analyzed (that is, they 1329/// don't escape) and tries to turn them into scalar SSA values. There are 1330/// a few steps to this process. 
1331/// 1332/// 1) It takes allocations of aggregates and analyzes the ways in which they 1333/// are used to try to split them into smaller allocations, ideally of 1334/// a single scalar data type. It will split up memcpy and memset accesses 1335/// as necessary and try to isolate invidual scalar accesses. 1336/// 2) It will transform accesses into forms which are suitable for SSA value 1337/// promotion. This can be replacing a memset with a scalar store of an 1338/// integer value, or it can involve speculating operations on a PHI or 1339/// select to be a PHI or select of the results. 1340/// 3) Finally, this will try to detect a pattern of accesses which map cleanly 1341/// onto insert and extract operations on a vector value, and convert them to 1342/// this form. By doing so, it will enable promotion of vector aggregates to 1343/// SSA vector values. 1344class SROA : public FunctionPass { 1345 const bool RequiresDomTree; 1346 1347 LLVMContext *C; 1348 const DataLayout *TD; 1349 DominatorTree *DT; 1350 1351 /// \brief Worklist of alloca instructions to simplify. 1352 /// 1353 /// Each alloca in the function is added to this. Each new alloca formed gets 1354 /// added to it as well to recursively simplify unless that alloca can be 1355 /// directly promoted. Finally, each time we rewrite a use of an alloca other 1356 /// the one being actively rewritten, we add it back onto the list if not 1357 /// already present to ensure it is re-visited. 1358 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist; 1359 1360 /// \brief A collection of instructions to delete. 1361 /// We try to batch deletions to simplify code and make things a bit more 1362 /// efficient. 1363 SmallVector<Instruction *, 8> DeadInsts; 1364 1365 /// \brief A set to prevent repeatedly marking an instruction split into many 1366 /// uses as dead. Only used to guard insertion into DeadInsts. 1367 SmallPtrSet<Instruction *, 4> DeadSplitInsts; 1368 1369 /// \brief Post-promotion worklist. 1370 /// 1371 /// Sometimes we discover an alloca which has a high probability of becoming 1372 /// viable for SROA after a round of promotion takes place. In those cases, 1373 /// the alloca is enqueued here for re-processing. 1374 /// 1375 /// Note that we have to be very careful to clear allocas out of this list in 1376 /// the event they are deleted. 1377 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist; 1378 1379 /// \brief A collection of alloca instructions we can directly promote. 
  std::vector<AllocaInst *> PromotableAllocas;

public:
  SROA(bool RequiresDomTree = true)
      : FunctionPass(ID), RequiresDomTree(RequiresDomTree),
        C(0), TD(0), DT(0) {
    initializeSROAPass(*PassRegistry::getPassRegistry());
  }
  bool runOnFunction(Function &F);
  void getAnalysisUsage(AnalysisUsage &AU) const;

  const char *getPassName() const { return "SROA"; }
  static char ID;

private:
  friend class PHIOrSelectSpeculator;
  friend class AllocaPartitionRewriter;
  friend class AllocaPartitionVectorRewriter;

  bool rewriteAllocaPartition(AllocaInst &AI,
                              AllocaPartitioning &P,
                              AllocaPartitioning::iterator PI);
  bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P);
  bool runOnAlloca(AllocaInst &AI);
  void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas);
  bool promoteAllocas(Function &F);
};
}

char SROA::ID = 0;

FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
  return new SROA(RequiresDomTree);
}

INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
                    false, false)

namespace {
/// \brief Visitor to speculate PHIs and Selects where possible.
class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> {
  // Befriend the base class so it can delegate to private visit methods.
  friend class llvm::InstVisitor<PHIOrSelectSpeculator>;

  const DataLayout &TD;
  AllocaPartitioning &P;
  SROA &Pass;

public:
  PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass)
      : TD(TD), P(P), Pass(Pass) {}

  /// \brief Visit the users of an alloca partition and rewrite them.
  void visitUsers(AllocaPartitioning::const_iterator PI) {
    // Note that we need to use an index here as the underlying vector of uses
    // may be grown during speculation. However, we never need to re-visit the
    // new uses, and so we can use the initial size bound.
    for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) {
      const AllocaPartitioning::PartitionUse &PU = P.getUse(PI, Idx);
      if (!PU.U)
        continue; // Skip dead use.

      visit(cast<Instruction>(PU.U->getUser()));
    }
  }

private:
  // By default, skip this instruction.
  void visitInstruction(Instruction &I) {}

  /// PHI instructions that use an alloca and are subsequently loaded can be
  /// rewritten to load both input pointers in the pred blocks and then PHI the
  /// results, allowing the load of the alloca to be promoted.
  /// From this:
  ///   %P2 = phi [i32* %Alloca, i32* %Other]
  ///   %V = load i32* %P2
  /// to:
  ///   %V1 = load i32* %Alloca -> will be mem2reg'd
  ///   ...
  ///   %V2 = load i32* %Other
  ///   ...
  ///   %V = phi [i32 %V1, i32 %V2]
  ///
  /// We can do this to a select if its only uses are loads and if the operands
  /// to the select can be loaded unconditionally.
  ///
  /// FIXME: This should be hoisted into a generic utility, likely in
  /// Transforms/Util/Local.h
  bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) {
    // For now, we can only do this promotion if the load is in the same block
    // as the PHI, and if there are no stores between the phi and load.
    // TODO: Allow recursive phi users.
    // TODO: Allow stores.
    BasicBlock *BB = PN.getParent();
    unsigned MaxAlign = 0;
    for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end();
         UI != UE; ++UI) {
      LoadInst *LI = dyn_cast<LoadInst>(*UI);
      if (LI == 0 || !LI->isSimple()) return false;

      // For now we only allow loads in the same block as the PHI. This is
      // a common case that happens when instcombine merges two loads through
      // a PHI.
      if (LI->getParent() != BB) return false;

      // Ensure that there are no instructions between the PHI and the load that
      // could store.
      for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
        if (BBI->mayWriteToMemory())
          return false;

      MaxAlign = std::max(MaxAlign, LI->getAlignment());
      Loads.push_back(LI);
    }

    // We can only transform this if it is safe to push the loads into the
    // predecessor blocks. The only thing to watch out for is that we can't put
    // a possibly trapping load in the predecessor if it is a critical edge.
    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num;
         ++Idx) {
      TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
      Value *InVal = PN.getIncomingValue(Idx);

      // If the value is produced by the terminator of the predecessor (an
      // invoke) or it has side-effects, there is no valid place to put a load
      // in the predecessor.
      if (TI == InVal || TI->mayHaveSideEffects())
        return false;

      // If the predecessor has a single successor, then the edge isn't
      // critical.
      if (TI->getNumSuccessors() == 1)
        continue;

      // If this pointer is always safe to load, or if we can prove that there
      // is already a load in the block, then we can move the load to the pred
      // block.
      if (InVal->isDereferenceablePointer() ||
          isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD))
        continue;

      return false;
    }

    return true;
  }

  void visitPHINode(PHINode &PN) {
    DEBUG(dbgs() << " original: " << PN << "\n");

    SmallVector<LoadInst *, 4> Loads;
    if (!isSafePHIToSpeculate(PN, Loads))
      return;

    assert(!Loads.empty());

    Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
    IRBuilder<> PHIBuilder(&PN);
    PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
                                          PN.getName() + ".sroa.speculated");

    // Get the TBAA tag and alignment to use from one of the loads. It doesn't
    // matter which one we get and if any differ, it doesn't matter.
    LoadInst *SomeLoad = cast<LoadInst>(Loads.back());
    MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
    unsigned Align = SomeLoad->getAlignment();

    // Rewrite all loads of the PN to use the new PHI.
    do {
      LoadInst *LI = Loads.pop_back_val();
      LI->replaceAllUsesWith(NewPN);
      Pass.DeadInsts.push_back(LI);
    } while (!Loads.empty());

    // Inject loads into all of the pred blocks.
    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
      BasicBlock *Pred = PN.getIncomingBlock(Idx);
      TerminatorInst *TI = Pred->getTerminator();
      Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx));
      Value *InVal = PN.getIncomingValue(Idx);
      IRBuilder<> PredBuilder(TI);

      LoadInst *Load
        = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." +
+ 1567 Pred->getName())); 1568 ++NumLoadsSpeculated; 1569 Load->setAlignment(Align); 1570 if (TBAATag) 1571 Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); 1572 NewPN->addIncoming(Load, Pred); 1573 1574 Instruction *Ptr = dyn_cast<Instruction>(InVal); 1575 if (!Ptr) 1576 // No uses to rewrite. 1577 continue; 1578 1579 // Try to lookup and rewrite any partition uses corresponding to this phi 1580 // input. 1581 AllocaPartitioning::iterator PI 1582 = P.findPartitionForPHIOrSelectOperand(InUse); 1583 if (PI == P.end()) 1584 continue; 1585 1586 // Replace the Use in the PartitionUse for this operand with the Use 1587 // inside the load. 1588 AllocaPartitioning::use_iterator UI 1589 = P.findPartitionUseForPHIOrSelectOperand(InUse); 1590 assert(isa<PHINode>(*UI->U->getUser())); 1591 UI->U = &Load->getOperandUse(Load->getPointerOperandIndex()); 1592 } 1593 DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); 1594 } 1595 1596 /// Select instructions that use an alloca and are subsequently loaded can be 1597 /// rewritten to load both input pointers and then select between the result, 1598 /// allowing the load of the alloca to be promoted. 1599 /// From this: 1600 /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other 1601 /// %V = load i32* %P2 1602 /// to: 1603 /// %V1 = load i32* %Alloca -> will be mem2reg'd 1604 /// %V2 = load i32* %Other 1605 /// %V = select i1 %cond, i32 %V1, i32 %V2 1606 /// 1607 /// We can do this to a select if its only uses are loads and if the operand 1608 /// to the select can be loaded unconditionally. 1609 bool isSafeSelectToSpeculate(SelectInst &SI, 1610 SmallVectorImpl<LoadInst *> &Loads) { 1611 Value *TValue = SI.getTrueValue(); 1612 Value *FValue = SI.getFalseValue(); 1613 bool TDerefable = TValue->isDereferenceablePointer(); 1614 bool FDerefable = FValue->isDereferenceablePointer(); 1615 1616 for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); 1617 UI != UE; ++UI) { 1618 LoadInst *LI = dyn_cast<LoadInst>(*UI); 1619 if (LI == 0 || !LI->isSimple()) return false; 1620 1621 // Both operands to the select need to be dereferencable, either 1622 // absolutely (e.g. allocas) or at this point because we can see other 1623 // accesses to it. 1624 if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI, 1625 LI->getAlignment(), &TD)) 1626 return false; 1627 if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI, 1628 LI->getAlignment(), &TD)) 1629 return false; 1630 Loads.push_back(LI); 1631 } 1632 1633 return true; 1634 } 1635 1636 void visitSelectInst(SelectInst &SI) { 1637 DEBUG(dbgs() << " original: " << SI << "\n"); 1638 IRBuilder<> IRB(&SI); 1639 1640 // If the select isn't safe to speculate, just use simple logic to emit it. 1641 SmallVector<LoadInst *, 4> Loads; 1642 if (!isSafeSelectToSpeculate(SI, Loads)) 1643 return; 1644 1645 Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) }; 1646 AllocaPartitioning::iterator PIs[2]; 1647 AllocaPartitioning::PartitionUse PUs[2]; 1648 for (unsigned i = 0, e = 2; i != e; ++i) { 1649 PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]); 1650 if (PIs[i] != P.end()) { 1651 // If the pointer is within the partitioning, remove the select from 1652 // its uses. We'll add in the new loads below. 1653 AllocaPartitioning::use_iterator UI 1654 = P.findPartitionUseForPHIOrSelectOperand(Ops[i]); 1655 PUs[i] = *UI; 1656 // Clear out the use here so that the offsets into the use list remain 1657 // stable but this use is ignored when rewriting. 
1658 UI->U = 0; 1659 } 1660 } 1661 1662 Value *TV = SI.getTrueValue(); 1663 Value *FV = SI.getFalseValue(); 1664 // Replace the loads of the select with a select of two loads. 1665 while (!Loads.empty()) { 1666 LoadInst *LI = Loads.pop_back_val(); 1667 1668 IRB.SetInsertPoint(LI); 1669 LoadInst *TL = 1670 IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); 1671 LoadInst *FL = 1672 IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); 1673 NumLoadsSpeculated += 2; 1674 1675 // Transfer alignment and TBAA info if present. 1676 TL->setAlignment(LI->getAlignment()); 1677 FL->setAlignment(LI->getAlignment()); 1678 if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { 1679 TL->setMetadata(LLVMContext::MD_tbaa, Tag); 1680 FL->setMetadata(LLVMContext::MD_tbaa, Tag); 1681 } 1682 1683 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, 1684 LI->getName() + ".sroa.speculated"); 1685 1686 LoadInst *Loads[2] = { TL, FL }; 1687 for (unsigned i = 0, e = 2; i != e; ++i) { 1688 if (PIs[i] != P.end()) { 1689 Use *LoadUse = &Loads[i]->getOperandUse(0); 1690 assert(PUs[i].U->get() == LoadUse->get()); 1691 PUs[i].U = LoadUse; 1692 P.use_push_back(PIs[i], PUs[i]); 1693 } 1694 } 1695 1696 DEBUG(dbgs() << " speculated to: " << *V << "\n"); 1697 LI->replaceAllUsesWith(V); 1698 Pass.DeadInsts.push_back(LI); 1699 } 1700 } 1701}; 1702} 1703 1704/// \brief Accumulate the constant offsets in a GEP into a single APInt offset. 1705/// 1706/// If the provided GEP is all-constant, the total byte offset formed by the 1707/// GEP is computed and Offset is set to it. If the GEP has any non-constant 1708/// operands, the function returns false and the value of Offset is unmodified. 1709static bool accumulateGEPOffsets(const DataLayout &TD, GEPOperator &GEP, 1710 APInt &Offset) { 1711 APInt GEPOffset(Offset.getBitWidth(), 0); 1712 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); 1713 GTI != GTE; ++GTI) { 1714 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); 1715 if (!OpC) 1716 return false; 1717 if (OpC->isZero()) continue; 1718 1719 // Handle a struct index, which adds its field offset to the pointer. 1720 if (StructType *STy = dyn_cast<StructType>(*GTI)) { 1721 unsigned ElementIdx = OpC->getZExtValue(); 1722 const StructLayout *SL = TD.getStructLayout(STy); 1723 GEPOffset += APInt(Offset.getBitWidth(), 1724 SL->getElementOffset(ElementIdx)); 1725 continue; 1726 } 1727 1728 APInt TypeSize(Offset.getBitWidth(), 1729 TD.getTypeAllocSize(GTI.getIndexedType())); 1730 if (VectorType *VTy = dyn_cast<VectorType>(*GTI)) { 1731 assert((VTy->getScalarSizeInBits() % 8) == 0 && 1732 "vector element size is not a multiple of 8, cannot GEP over it"); 1733 TypeSize = VTy->getScalarSizeInBits() / 8; 1734 } 1735 1736 GEPOffset += OpC->getValue().sextOrTrunc(Offset.getBitWidth()) * TypeSize; 1737 } 1738 Offset = GEPOffset; 1739 return true; 1740} 1741 1742/// \brief Build a GEP out of a base pointer and indices. 1743/// 1744/// This will return the BasePtr if that is valid, or build a new GEP 1745/// instruction using the IRBuilder if GEP-ing is needed. 1746static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr, 1747 SmallVectorImpl<Value *> &Indices, 1748 const Twine &Prefix) { 1749 if (Indices.empty()) 1750 return BasePtr; 1751 1752 // A single zero index is a no-op, so check for this and avoid building a GEP 1753 // in that case. 
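  // For example, "getelementptr inbounds i32* %ptr, i64 0" yields a pointer
  // identical to %ptr, so returning BasePtr avoids emitting a redundant
  // instruction.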
1754 if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) 1755 return BasePtr; 1756 1757 return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx"); 1758} 1759 1760/// \brief Get a natural GEP off of the BasePtr walking through Ty toward 1761/// TargetTy without changing the offset of the pointer. 1762/// 1763/// This routine assumes we've already established a properly offset GEP with 1764/// Indices, and arrived at the Ty type. The goal is to continue to GEP with 1765/// zero-indices down through type layers until we find one the same as 1766/// TargetTy. If we can't find one with the same type, we at least try to use 1767/// one with the same size. If none of that works, we just produce the GEP as 1768/// indicated by Indices to have the correct offset. 1769static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD, 1770 Value *BasePtr, Type *Ty, Type *TargetTy, 1771 SmallVectorImpl<Value *> &Indices, 1772 const Twine &Prefix) { 1773 if (Ty == TargetTy) 1774 return buildGEP(IRB, BasePtr, Indices, Prefix); 1775 1776 // See if we can descend into a struct and locate a field with the correct 1777 // type. 1778 unsigned NumLayers = 0; 1779 Type *ElementTy = Ty; 1780 do { 1781 if (ElementTy->isPointerTy()) 1782 break; 1783 if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) { 1784 ElementTy = SeqTy->getElementType(); 1785 // Note that we use the default address space as this index is over an 1786 // array or a vector, not a pointer. 1787 Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0))); 1788 } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { 1789 if (STy->element_begin() == STy->element_end()) 1790 break; // Nothing left to descend into. 1791 ElementTy = *STy->element_begin(); 1792 Indices.push_back(IRB.getInt32(0)); 1793 } else { 1794 break; 1795 } 1796 ++NumLayers; 1797 } while (ElementTy != TargetTy); 1798 if (ElementTy != TargetTy) 1799 Indices.erase(Indices.end() - NumLayers, Indices.end()); 1800 1801 return buildGEP(IRB, BasePtr, Indices, Prefix); 1802} 1803 1804/// \brief Recursively compute indices for a natural GEP. 1805/// 1806/// This is the recursive step for getNaturalGEPWithOffset that walks down the 1807/// element types adding appropriate indices for the GEP. 1808static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, 1809 Value *Ptr, Type *Ty, APInt &Offset, 1810 Type *TargetTy, 1811 SmallVectorImpl<Value *> &Indices, 1812 const Twine &Prefix) { 1813 if (Offset == 0) 1814 return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix); 1815 1816 // We can't recurse through pointer types. 1817 if (Ty->isPointerTy()) 1818 return 0; 1819 1820 // We try to analyze GEPs over vectors here, but note that these GEPs are 1821 // extremely poorly defined currently. The long-term goal is to remove GEPing 1822 // over a vector from the IR completely. 1823 if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { 1824 unsigned ElementSizeInBits = VecTy->getScalarSizeInBits(); 1825 if (ElementSizeInBits % 8) 1826 return 0; // GEPs over non-multiple of 8 size vector elements are invalid. 
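    // Compute how many whole elements the remaining offset skips; for example
    // (illustrative), a byte offset of 8 into a <4 x i32> lands on element
    // index 2 with no remaining offset.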
1827 APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); 1828 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1829 if (NumSkippedElements.ugt(VecTy->getNumElements())) 1830 return 0; 1831 Offset -= NumSkippedElements * ElementSize; 1832 Indices.push_back(IRB.getInt(NumSkippedElements)); 1833 return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), 1834 Offset, TargetTy, Indices, Prefix); 1835 } 1836 1837 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 1838 Type *ElementTy = ArrTy->getElementType(); 1839 APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); 1840 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1841 if (NumSkippedElements.ugt(ArrTy->getNumElements())) 1842 return 0; 1843 1844 Offset -= NumSkippedElements * ElementSize; 1845 Indices.push_back(IRB.getInt(NumSkippedElements)); 1846 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1847 Indices, Prefix); 1848 } 1849 1850 StructType *STy = dyn_cast<StructType>(Ty); 1851 if (!STy) 1852 return 0; 1853 1854 const StructLayout *SL = TD.getStructLayout(STy); 1855 uint64_t StructOffset = Offset.getZExtValue(); 1856 if (StructOffset >= SL->getSizeInBytes()) 1857 return 0; 1858 unsigned Index = SL->getElementContainingOffset(StructOffset); 1859 Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); 1860 Type *ElementTy = STy->getElementType(Index); 1861 if (Offset.uge(TD.getTypeAllocSize(ElementTy))) 1862 return 0; // The offset points into alignment padding. 1863 1864 Indices.push_back(IRB.getInt32(Index)); 1865 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1866 Indices, Prefix); 1867} 1868 1869/// \brief Get a natural GEP from a base pointer to a particular offset and 1870/// resulting in a particular type. 1871/// 1872/// The goal is to produce a "natural" looking GEP that works with the existing 1873/// composite types to arrive at the appropriate offset and element type for 1874/// a pointer. TargetTy is the element type the returned GEP should point-to if 1875/// possible. We recurse by decreasing Offset, adding the appropriate index to 1876/// Indices, and setting Ty to the result subtype. 1877/// 1878/// If no natural GEP can be constructed, this function returns null. 1879static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, 1880 Value *Ptr, APInt Offset, Type *TargetTy, 1881 SmallVectorImpl<Value *> &Indices, 1882 const Twine &Prefix) { 1883 PointerType *Ty = cast<PointerType>(Ptr->getType()); 1884 1885 // Don't consider any GEPs through an i8* as natural unless the TargetTy is 1886 // an i8. 1887 if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8)) 1888 return 0; 1889 1890 Type *ElementTy = Ty->getElementType(); 1891 if (!ElementTy->isSized()) 1892 return 0; // We can't GEP through an unsized element. 1893 APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); 1894 if (ElementSize == 0) 1895 return 0; // Zero-length arrays can't help us build a natural GEP. 1896 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1897 1898 Offset -= NumSkippedElements * ElementSize; 1899 Indices.push_back(IRB.getInt(NumSkippedElements)); 1900 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1901 Indices, Prefix); 1902} 1903 1904/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the 1905/// resulting pointer has PointerTy. 
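/// For example (an illustrative case), adjusting %a = alloca { i32, i32 } by
/// an Offset of 4 to an i32* PointerTy would ideally produce the natural GEP
///   getelementptr inbounds { i32, i32 }* %a, i64 0, i32 1
/// rather than an i8* offset plus a bitcast.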
1906/// 1907/// This tries very hard to compute a "natural" GEP which arrives at the offset 1908/// and produces the pointer type desired. Where it cannot, it will try to use 1909/// the natural GEP to arrive at the offset and bitcast to the type. Where that 1910/// fails, it will try to use an existing i8* and GEP to the byte offset and 1911/// bitcast to the type. 1912/// 1913/// The strategy for finding the more natural GEPs is to peel off layers of the 1914/// pointer, walking back through bit casts and GEPs, searching for a base 1915/// pointer from which we can compute a natural GEP with the desired 1916/// properities. The algorithm tries to fold as many constant indices into 1917/// a single GEP as possible, thus making each GEP more independent of the 1918/// surrounding code. 1919static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD, 1920 Value *Ptr, APInt Offset, Type *PointerTy, 1921 const Twine &Prefix) { 1922 // Even though we don't look through PHI nodes, we could be called on an 1923 // instruction in an unreachable block, which may be on a cycle. 1924 SmallPtrSet<Value *, 4> Visited; 1925 Visited.insert(Ptr); 1926 SmallVector<Value *, 4> Indices; 1927 1928 // We may end up computing an offset pointer that has the wrong type. If we 1929 // never are able to compute one directly that has the correct type, we'll 1930 // fall back to it, so keep it around here. 1931 Value *OffsetPtr = 0; 1932 1933 // Remember any i8 pointer we come across to re-use if we need to do a raw 1934 // byte offset. 1935 Value *Int8Ptr = 0; 1936 APInt Int8PtrOffset(Offset.getBitWidth(), 0); 1937 1938 Type *TargetTy = PointerTy->getPointerElementType(); 1939 1940 do { 1941 // First fold any existing GEPs into the offset. 1942 while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { 1943 APInt GEPOffset(Offset.getBitWidth(), 0); 1944 if (!accumulateGEPOffsets(TD, *GEP, GEPOffset)) 1945 break; 1946 Offset += GEPOffset; 1947 Ptr = GEP->getPointerOperand(); 1948 if (!Visited.insert(Ptr)) 1949 break; 1950 } 1951 1952 // See if we can perform a natural GEP here. 1953 Indices.clear(); 1954 if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy, 1955 Indices, Prefix)) { 1956 if (P->getType() == PointerTy) { 1957 // Zap any offset pointer that we ended up computing in previous rounds. 1958 if (OffsetPtr && OffsetPtr->use_empty()) 1959 if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) 1960 I->eraseFromParent(); 1961 return P; 1962 } 1963 if (!OffsetPtr) { 1964 OffsetPtr = P; 1965 } 1966 } 1967 1968 // Stash this pointer if we've found an i8*. 1969 if (Ptr->getType()->isIntegerTy(8)) { 1970 Int8Ptr = Ptr; 1971 Int8PtrOffset = Offset; 1972 } 1973 1974 // Peel off a layer of the pointer and update the offset appropriately. 1975 if (Operator::getOpcode(Ptr) == Instruction::BitCast) { 1976 Ptr = cast<Operator>(Ptr)->getOperand(0); 1977 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) { 1978 if (GA->mayBeOverridden()) 1979 break; 1980 Ptr = GA->getAliasee(); 1981 } else { 1982 break; 1983 } 1984 assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!"); 1985 } while (Visited.insert(Ptr)); 1986 1987 if (!OffsetPtr) { 1988 if (!Int8Ptr) { 1989 Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(), 1990 Prefix + ".raw_cast"); 1991 Int8PtrOffset = Offset; 1992 } 1993 1994 OffsetPtr = Int8PtrOffset == 0 ? 
Int8Ptr : 1995 IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), 1996 Prefix + ".raw_idx"); 1997 } 1998 Ptr = OffsetPtr; 1999 2000 // On the off chance we were targeting i8*, guard the bitcast here. 2001 if (Ptr->getType() != PointerTy) 2002 Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast"); 2003 2004 return Ptr; 2005} 2006 2007/// \brief Test whether we can convert a value from the old to the new type. 2008/// 2009/// This predicate should be used to guard calls to convertValue in order to 2010/// ensure that we only try to convert viable values. The strategy is that we 2011/// will peel off single element struct and array wrappings to get to an 2012/// underlying value, and convert that value. 2013static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { 2014 if (OldTy == NewTy) 2015 return true; 2016 if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) 2017 return false; 2018 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) 2019 return false; 2020 2021 if (NewTy->isPointerTy() || OldTy->isPointerTy()) { 2022 if (NewTy->isPointerTy() && OldTy->isPointerTy()) 2023 return true; 2024 if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) 2025 return true; 2026 return false; 2027 } 2028 2029 return true; 2030} 2031 2032/// \brief Generic routine to convert an SSA value to a value of a different 2033/// type. 2034/// 2035/// This will try various different casting techniques, such as bitcasts, 2036/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test 2037/// two types for viability with this routine. 2038static Value *convertValue(const DataLayout &DL, IRBuilder<> &IRB, Value *V, 2039 Type *Ty) { 2040 assert(canConvertValue(DL, V->getType(), Ty) && 2041 "Value not convertable to type"); 2042 if (V->getType() == Ty) 2043 return V; 2044 if (V->getType()->isIntegerTy() && Ty->isPointerTy()) 2045 return IRB.CreateIntToPtr(V, Ty); 2046 if (V->getType()->isPointerTy() && Ty->isIntegerTy()) 2047 return IRB.CreatePtrToInt(V, Ty); 2048 2049 return IRB.CreateBitCast(V, Ty); 2050} 2051 2052/// \brief Test whether the given alloca partition can be promoted to a vector. 2053/// 2054/// This is a quick test to check whether we can rewrite a particular alloca 2055/// partition (and its newly formed alloca) into a vector alloca with only 2056/// whole-vector loads and stores such that it could be promoted to a vector 2057/// SSA value. We only can ensure this for a limited set of operations, and we 2058/// don't want to do the rewrites unless we are confident that the result will 2059/// be promotable, so we have an early test here. 2060static bool isVectorPromotionViable(const DataLayout &TD, 2061 Type *AllocaTy, 2062 AllocaPartitioning &P, 2063 uint64_t PartitionBeginOffset, 2064 uint64_t PartitionEndOffset, 2065 AllocaPartitioning::const_use_iterator I, 2066 AllocaPartitioning::const_use_iterator E) { 2067 VectorType *Ty = dyn_cast<VectorType>(AllocaTy); 2068 if (!Ty) 2069 return false; 2070 2071 uint64_t VecSize = TD.getTypeSizeInBits(Ty); 2072 uint64_t ElementSize = Ty->getScalarSizeInBits(); 2073 2074 // While the definition of LLVM vectors is bitpacked, we don't support sizes 2075 // that aren't byte sized. 2076 if (ElementSize % 8) 2077 return false; 2078 assert((VecSize % 8) == 0 && "vector size not a multiple of element size?"); 2079 VecSize /= 8; 2080 ElementSize /= 8; 2081 2082 for (; I != E; ++I) { 2083 if (!I->U) 2084 continue; // Skip dead use. 
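    // Map this use's byte range onto whole vector elements; for example
    // (illustrative), bytes [4,8) of a <4 x i32> partition must line up
    // exactly with element 1 for the partition to remain viable as a vector.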
2085
2086     uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset;
2087     uint64_t BeginIndex = BeginOffset / ElementSize;
2088     if (BeginIndex * ElementSize != BeginOffset ||
2089         BeginIndex >= Ty->getNumElements())
2090       return false;
2091     uint64_t EndOffset = I->EndOffset - PartitionBeginOffset;
2092     uint64_t EndIndex = EndOffset / ElementSize;
2093     if (EndIndex * ElementSize != EndOffset ||
2094         EndIndex > Ty->getNumElements())
2095       return false;
2096
2097     // FIXME: We should build shuffle vector instructions to handle
2098     // non-element-sized accesses.
2099     if ((EndOffset - BeginOffset) != ElementSize &&
2100         (EndOffset - BeginOffset) != VecSize)
2101       return false;
2102
2103     if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
2104       if (MI->isVolatile())
2105         return false;
2106       if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
2107         const AllocaPartitioning::MemTransferOffsets &MTO
2108           = P.getMemTransferOffsets(*MTI);
2109         if (!MTO.IsSplittable)
2110           return false;
2111       }
2112     } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) {
2113       // Disable vector promotion when there are loads or stores of an FCA.
2114       return false;
2115     } else if (!isa<LoadInst>(I->U->getUser()) &&
2116                !isa<StoreInst>(I->U->getUser())) {
2117       return false;
2118     }
2119   }
2120   return true;
2121 }
2122
2123 /// \brief Test whether the given alloca partition's integer operations can be
2124 /// widened to promotable ones.
2125 ///
2126 /// This is a quick test to check whether we can rewrite the integer loads and
2127 /// stores to a particular alloca into wider loads and stores and be able to
2128 /// promote the resulting alloca.
2129 static bool isIntegerWideningViable(const DataLayout &TD,
2130                                     Type *AllocaTy,
2131                                     uint64_t AllocBeginOffset,
2132                                     AllocaPartitioning &P,
2133                                     AllocaPartitioning::const_use_iterator I,
2134                                     AllocaPartitioning::const_use_iterator E) {
2135   uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy);
2136
2137   // Don't try to handle allocas with bit-padding.
2138   if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy))
2139     return false;
2140
2141   uint64_t Size = TD.getTypeStoreSize(AllocaTy);
2142
2143   // Check the uses to ensure they are (likely) promotable integer uses.
2144   // Also ensure that the alloca has a covering load or store. We don't want
2145   // to widen the integer operations only to fail to promote due to some other
2146   // unsplittable entry (which we may make splittable later).
2147   bool WholeAllocaOp = false;
2148   for (; I != E; ++I) {
2149     if (!I->U)
2150       continue; // Skip dead use.
2151
2152     uint64_t RelBegin = I->BeginOffset - AllocBeginOffset;
2153     uint64_t RelEnd = I->EndOffset - AllocBeginOffset;
2154
2155     // We can't reasonably handle cases where the load or store extends past
2156     // the end of the alloca's type and into its padding.
2157     if (RelEnd > Size)
2158       return false;
2159
2160     if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
2161       if (LI->isVolatile())
2162         return false;
2163       if (RelBegin == 0 && RelEnd == Size)
2164         WholeAllocaOp = true;
2165       if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2166         if (ITy->getBitWidth() < TD.getTypeStoreSize(ITy))
2167           return false;
2168         continue;
2169       }
2170       // Non-integer loads need to be convertible from the alloca type so that
2171       // they are promotable.
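      // For example (illustrative), a "load float" covering an entire
      // i32-sized partition can be handled via a bitcast of the widened
      // integer, but a float load of only part of the partition cannot.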
2172 if (RelBegin != 0 || RelEnd != Size || 2173 !canConvertValue(TD, AllocaTy, LI->getType())) 2174 return false; 2175 } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) { 2176 Type *ValueTy = SI->getValueOperand()->getType(); 2177 if (SI->isVolatile()) 2178 return false; 2179 if (RelBegin == 0 && RelEnd == Size) 2180 WholeAllocaOp = true; 2181 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { 2182 if (ITy->getBitWidth() < TD.getTypeStoreSize(ITy)) 2183 return false; 2184 continue; 2185 } 2186 // Non-integer stores need to be convertible to the alloca type so that 2187 // they are promotable. 2188 if (RelBegin != 0 || RelEnd != Size || 2189 !canConvertValue(TD, ValueTy, AllocaTy)) 2190 return false; 2191 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) { 2192 if (MI->isVolatile()) 2193 return false; 2194 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) { 2195 const AllocaPartitioning::MemTransferOffsets &MTO 2196 = P.getMemTransferOffsets(*MTI); 2197 if (!MTO.IsSplittable) 2198 return false; 2199 } 2200 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->U->getUser())) { 2201 if (II->getIntrinsicID() != Intrinsic::lifetime_start && 2202 II->getIntrinsicID() != Intrinsic::lifetime_end) 2203 return false; 2204 } else { 2205 return false; 2206 } 2207 } 2208 return WholeAllocaOp; 2209} 2210 2211static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V, 2212 IntegerType *Ty, uint64_t Offset, 2213 const Twine &Name) { 2214 IntegerType *IntTy = cast<IntegerType>(V->getType()); 2215 assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && 2216 "Element extends past full value"); 2217 uint64_t ShAmt = 8*Offset; 2218 if (DL.isBigEndian()) 2219 ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); 2220 if (ShAmt) 2221 V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); 2222 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 2223 "Cannot extract to a larger integer!"); 2224 if (Ty != IntTy) 2225 V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); 2226 return V; 2227} 2228 2229static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old, 2230 Value *V, uint64_t Offset, const Twine &Name) { 2231 IntegerType *IntTy = cast<IntegerType>(Old->getType()); 2232 IntegerType *Ty = cast<IntegerType>(V->getType()); 2233 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 2234 "Cannot insert a larger integer!"); 2235 if (Ty != IntTy) 2236 V = IRB.CreateZExt(V, IntTy, Name + ".ext"); 2237 assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && 2238 "Element store outside of alloca store"); 2239 uint64_t ShAmt = 8*Offset; 2240 if (DL.isBigEndian()) 2241 ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); 2242 if (ShAmt) 2243 V = IRB.CreateShl(V, ShAmt, Name + ".shift"); 2244 2245 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { 2246 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); 2247 Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); 2248 V = IRB.CreateOr(Old, V, Name + ".insert"); 2249 } 2250 return V; 2251} 2252 2253namespace { 2254/// \brief Visitor to rewrite instructions using a partition of an alloca to 2255/// use a new alloca. 2256/// 2257/// Also implements the rewriting to vector-based accesses when the partition 2258/// passes the isVectorPromotionViable predicate. Most of the rewriting logic 2259/// lives here. 
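/// For example (a rough illustration; names are invented), a load through
/// a GEP into the original alloca:
///   %p = getelementptr inbounds { i32, i32 }* %old, i64 0, i32 1
///   %v = load i32* %p
/// is rewritten to load directly from the new, element-sized alloca:
///   %v = load i32* %old.sroa.1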
2260class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, 2261 bool> { 2262 // Befriend the base class so it can delegate to private visit methods. 2263 friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>; 2264 2265 const DataLayout &TD; 2266 AllocaPartitioning &P; 2267 SROA &Pass; 2268 AllocaInst &OldAI, &NewAI; 2269 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; 2270 Type *NewAllocaTy; 2271 2272 // If we are rewriting an alloca partition which can be written as pure 2273 // vector operations, we stash extra information here. When VecTy is 2274 // non-null, we have some strict guarantees about the rewriten alloca: 2275 // - The new alloca is exactly the size of the vector type here. 2276 // - The accesses all either map to the entire vector or to a single 2277 // element. 2278 // - The set of accessing instructions is only one of those handled above 2279 // in isVectorPromotionViable. Generally these are the same access kinds 2280 // which are promotable via mem2reg. 2281 VectorType *VecTy; 2282 Type *ElementTy; 2283 uint64_t ElementSize; 2284 2285 // This is a convenience and flag variable that will be null unless the new 2286 // alloca's integer operations should be widened to this integer type due to 2287 // passing isIntegerWideningViable above. If it is non-null, the desired 2288 // integer type will be stored here for easy access during rewriting. 2289 IntegerType *IntTy; 2290 2291 // The offset of the partition user currently being rewritten. 2292 uint64_t BeginOffset, EndOffset; 2293 Use *OldUse; 2294 Instruction *OldPtr; 2295 2296 // The name prefix to use when rewriting instructions for this alloca. 2297 std::string NamePrefix; 2298 2299public: 2300 AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P, 2301 AllocaPartitioning::iterator PI, 2302 SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, 2303 uint64_t NewBeginOffset, uint64_t NewEndOffset) 2304 : TD(TD), P(P), Pass(Pass), 2305 OldAI(OldAI), NewAI(NewAI), 2306 NewAllocaBeginOffset(NewBeginOffset), 2307 NewAllocaEndOffset(NewEndOffset), 2308 NewAllocaTy(NewAI.getAllocatedType()), 2309 VecTy(), ElementTy(), ElementSize(), IntTy(), 2310 BeginOffset(), EndOffset() { 2311 } 2312 2313 /// \brief Visit the users of the alloca partition and rewrite them. 2314 bool visitUsers(AllocaPartitioning::const_use_iterator I, 2315 AllocaPartitioning::const_use_iterator E) { 2316 if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P, 2317 NewAllocaBeginOffset, NewAllocaEndOffset, 2318 I, E)) { 2319 ++NumVectorized; 2320 VecTy = cast<VectorType>(NewAI.getAllocatedType()); 2321 ElementTy = VecTy->getElementType(); 2322 assert((VecTy->getScalarSizeInBits() % 8) == 0 && 2323 "Only multiple-of-8 sized vector elements are viable"); 2324 ElementSize = VecTy->getScalarSizeInBits() / 8; 2325 } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(), 2326 NewAllocaBeginOffset, P, I, E)) { 2327 IntTy = Type::getIntNTy(NewAI.getContext(), 2328 TD.getTypeSizeInBits(NewAI.getAllocatedType())); 2329 } 2330 bool CanSROA = true; 2331 for (; I != E; ++I) { 2332 if (!I->U) 2333 continue; // Skip dead uses. 2334 BeginOffset = I->BeginOffset; 2335 EndOffset = I->EndOffset; 2336 OldUse = I->U; 2337 OldPtr = cast<Instruction>(I->U->get()); 2338 NamePrefix = (Twine(NewAI.getName()) + "." 
+ Twine(BeginOffset)).str(); 2339 CanSROA &= visit(cast<Instruction>(I->U->getUser())); 2340 } 2341 if (VecTy) { 2342 assert(CanSROA); 2343 VecTy = 0; 2344 ElementTy = 0; 2345 ElementSize = 0; 2346 } 2347 if (IntTy) { 2348 assert(CanSROA); 2349 IntTy = 0; 2350 } 2351 return CanSROA; 2352 } 2353 2354private: 2355 // Every instruction which can end up as a user must have a rewrite rule. 2356 bool visitInstruction(Instruction &I) { 2357 DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); 2358 llvm_unreachable("No rewrite rule for this instruction!"); 2359 } 2360 2361 Twine getName(const Twine &Suffix) { 2362 return NamePrefix + Suffix; 2363 } 2364 2365 Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) { 2366 assert(BeginOffset >= NewAllocaBeginOffset); 2367 unsigned AS = cast<PointerType>(PointerTy)->getAddressSpace(); 2368 APInt Offset(TD.getPointerSizeInBits(AS), BeginOffset - NewAllocaBeginOffset); 2369 return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName("")); 2370 } 2371 2372 /// \brief Compute suitable alignment to access an offset into the new alloca. 2373 unsigned getOffsetAlign(uint64_t Offset) { 2374 unsigned NewAIAlign = NewAI.getAlignment(); 2375 if (!NewAIAlign) 2376 NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType()); 2377 return MinAlign(NewAIAlign, Offset); 2378 } 2379 2380 /// \brief Compute suitable alignment to access this partition of the new 2381 /// alloca. 2382 unsigned getPartitionAlign() { 2383 return getOffsetAlign(BeginOffset - NewAllocaBeginOffset); 2384 } 2385 2386 /// \brief Compute suitable alignment to access a type at an offset of the 2387 /// new alloca. 2388 /// 2389 /// \returns zero if the type's ABI alignment is a suitable alignment, 2390 /// otherwise returns the maximal suitable alignment. 2391 unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { 2392 unsigned Align = getOffsetAlign(Offset); 2393 return Align == TD.getABITypeAlignment(Ty) ? 0 : Align; 2394 } 2395 2396 /// \brief Compute suitable alignment to access a type at the beginning of 2397 /// this partition of the new alloca. 2398 /// 2399 /// See \c getOffsetTypeAlign for details; this routine delegates to it. 
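  /// For example, with a 16-byte-aligned new alloca, a type accessed 4 bytes
  /// into the partition can only be assumed 4-byte aligned
  /// (MinAlign(16, 4) == 4); zero is returned if that happens to equal the
  /// type's ABI alignment.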
2400 unsigned getPartitionTypeAlign(Type *Ty) { 2401 return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset); 2402 } 2403 2404 ConstantInt *getIndex(IRBuilder<> &IRB, uint64_t Offset) { 2405 assert(VecTy && "Can only call getIndex when rewriting a vector"); 2406 uint64_t RelOffset = Offset - NewAllocaBeginOffset; 2407 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); 2408 uint32_t Index = RelOffset / ElementSize; 2409 assert(Index * ElementSize == RelOffset); 2410 return IRB.getInt32(Index); 2411 } 2412 2413 void deleteIfTriviallyDead(Value *V) { 2414 Instruction *I = cast<Instruction>(V); 2415 if (isInstructionTriviallyDead(I)) 2416 Pass.DeadInsts.push_back(I); 2417 } 2418 2419 bool rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) { 2420 Value *Result; 2421 if (LI.getType() == VecTy->getElementType() || 2422 BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) { 2423 Result = IRB.CreateExtractElement( 2424 IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")), 2425 getIndex(IRB, BeginOffset), getName(".extract")); 2426 } else { 2427 Result = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2428 getName(".load")); 2429 } 2430 if (Result->getType() != LI.getType()) 2431 Result = convertValue(TD, IRB, Result, LI.getType()); 2432 LI.replaceAllUsesWith(Result); 2433 Pass.DeadInsts.push_back(&LI); 2434 2435 DEBUG(dbgs() << " to: " << *Result << "\n"); 2436 return true; 2437 } 2438 2439 bool rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) { 2440 assert(IntTy && "We cannot insert an integer to the alloca"); 2441 assert(!LI.isVolatile()); 2442 Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2443 getName(".load")); 2444 V = convertValue(TD, IRB, V, IntTy); 2445 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2446 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2447 V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset, 2448 getName(".extract")); 2449 LI.replaceAllUsesWith(V); 2450 Pass.DeadInsts.push_back(&LI); 2451 DEBUG(dbgs() << " to: " << *V << "\n"); 2452 return true; 2453 } 2454 2455 bool visitLoadInst(LoadInst &LI) { 2456 DEBUG(dbgs() << " original: " << LI << "\n"); 2457 Value *OldOp = LI.getOperand(0); 2458 assert(OldOp == OldPtr); 2459 IRBuilder<> IRB(&LI); 2460 2461 if (VecTy) 2462 return rewriteVectorizedLoadInst(IRB, LI, OldOp); 2463 if (IntTy && LI.getType()->isIntegerTy()) 2464 return rewriteIntegerLoad(IRB, LI); 2465 2466 if (BeginOffset == NewAllocaBeginOffset && 2467 canConvertValue(TD, NewAllocaTy, LI.getType())) { 2468 Value *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2469 LI.isVolatile(), getName(".load")); 2470 Value *NewV = convertValue(TD, IRB, NewLI, LI.getType()); 2471 LI.replaceAllUsesWith(NewV); 2472 Pass.DeadInsts.push_back(&LI); 2473 2474 DEBUG(dbgs() << " to: " << *NewLI << "\n"); 2475 return !LI.isVolatile(); 2476 } 2477 2478 assert(!IntTy && "Invalid load found with int-op widening enabled"); 2479 2480 Value *NewPtr = getAdjustedAllocaPtr(IRB, 2481 LI.getPointerOperand()->getType()); 2482 LI.setOperand(0, NewPtr); 2483 LI.setAlignment(getPartitionTypeAlign(LI.getType())); 2484 DEBUG(dbgs() << " to: " << LI << "\n"); 2485 2486 deleteIfTriviallyDead(OldOp); 2487 return NewPtr == &NewAI && !LI.isVolatile(); 2488 } 2489 2490 bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, StoreInst &SI, 2491 Value *OldOp) { 2492 Value *V = SI.getValueOperand(); 2493 if (V->getType() == ElementTy || 2494 BeginOffset > 
NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) { 2495 if (V->getType() != ElementTy) 2496 V = convertValue(TD, IRB, V, ElementTy); 2497 LoadInst *LI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2498 getName(".load")); 2499 V = IRB.CreateInsertElement(LI, V, getIndex(IRB, BeginOffset), 2500 getName(".insert")); 2501 } else if (V->getType() != VecTy) { 2502 V = convertValue(TD, IRB, V, VecTy); 2503 } 2504 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 2505 Pass.DeadInsts.push_back(&SI); 2506 2507 (void)Store; 2508 DEBUG(dbgs() << " to: " << *Store << "\n"); 2509 return true; 2510 } 2511 2512 bool rewriteIntegerStore(IRBuilder<> &IRB, StoreInst &SI) { 2513 assert(IntTy && "We cannot extract an integer from the alloca"); 2514 assert(!SI.isVolatile()); 2515 Value *V = SI.getValueOperand(); 2516 if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { 2517 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2518 getName(".oldload")); 2519 Old = convertValue(TD, IRB, Old, IntTy); 2520 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2521 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2522 V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset, 2523 getName(".insert")); 2524 } 2525 V = convertValue(TD, IRB, V, NewAllocaTy); 2526 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 2527 Pass.DeadInsts.push_back(&SI); 2528 (void)Store; 2529 DEBUG(dbgs() << " to: " << *Store << "\n"); 2530 return true; 2531 } 2532 2533 bool visitStoreInst(StoreInst &SI) { 2534 DEBUG(dbgs() << " original: " << SI << "\n"); 2535 Value *OldOp = SI.getOperand(1); 2536 assert(OldOp == OldPtr); 2537 IRBuilder<> IRB(&SI); 2538 2539 if (VecTy) 2540 return rewriteVectorizedStoreInst(IRB, SI, OldOp); 2541 Type *ValueTy = SI.getValueOperand()->getType(); 2542 if (IntTy && ValueTy->isIntegerTy()) 2543 return rewriteIntegerStore(IRB, SI); 2544 2545 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2546 // alloca that should be re-examined after promoting this alloca. 2547 if (ValueTy->isPointerTy()) 2548 if (AllocaInst *AI = dyn_cast<AllocaInst>(SI.getValueOperand() 2549 ->stripInBoundsOffsets())) 2550 Pass.PostPromotionWorklist.insert(AI); 2551 2552 if (BeginOffset == NewAllocaBeginOffset && 2553 canConvertValue(TD, ValueTy, NewAllocaTy)) { 2554 Value *NewV = convertValue(TD, IRB, SI.getValueOperand(), NewAllocaTy); 2555 StoreInst *NewSI = IRB.CreateAlignedStore(NewV, &NewAI, NewAI.getAlignment(), 2556 SI.isVolatile()); 2557 (void)NewSI; 2558 Pass.DeadInsts.push_back(&SI); 2559 2560 DEBUG(dbgs() << " to: " << *NewSI << "\n"); 2561 return !SI.isVolatile(); 2562 } 2563 2564 assert(!IntTy && "Invalid store found with int-op widening enabled"); 2565 2566 Value *NewPtr = getAdjustedAllocaPtr(IRB, 2567 SI.getPointerOperand()->getType()); 2568 SI.setOperand(1, NewPtr); 2569 SI.setAlignment(getPartitionTypeAlign(SI.getValueOperand()->getType())); 2570 DEBUG(dbgs() << " to: " << SI << "\n"); 2571 2572 deleteIfTriviallyDead(OldOp); 2573 return NewPtr == &NewAI && !SI.isVolatile(); 2574 } 2575 2576 bool visitMemSetInst(MemSetInst &II) { 2577 DEBUG(dbgs() << " original: " << II << "\n"); 2578 IRBuilder<> IRB(&II); 2579 assert(II.getRawDest() == OldPtr); 2580 2581 // If the memset has a variable size, it cannot be split, just adjust the 2582 // pointer to the new alloca. 
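    // For example (illustrative), a call such as
    //   call void @llvm.memset.p0i8.i64(i8* %dst, i8 0, i64 %n, i32 1, i1 false)
    // with a runtime length %n keeps its memset form; only the destination
    // pointer is retargeted at the new alloca.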
2583 if (!isa<Constant>(II.getLength())) { 2584 II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); 2585 Type *CstTy = II.getAlignmentCst()->getType(); 2586 II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign())); 2587 2588 deleteIfTriviallyDead(OldPtr); 2589 return false; 2590 } 2591 2592 // Record this instruction for deletion. 2593 if (Pass.DeadSplitInsts.insert(&II)) 2594 Pass.DeadInsts.push_back(&II); 2595 2596 Type *AllocaTy = NewAI.getAllocatedType(); 2597 Type *ScalarTy = AllocaTy->getScalarType(); 2598 2599 // If this doesn't map cleanly onto the alloca type, and that type isn't 2600 // a single value type, just emit a memset. 2601 if (!VecTy && !IntTy && 2602 (BeginOffset != NewAllocaBeginOffset || 2603 EndOffset != NewAllocaEndOffset || 2604 !AllocaTy->isSingleValueType() || 2605 !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) { 2606 Type *SizeTy = II.getLength()->getType(); 2607 Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); 2608 CallInst *New 2609 = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB, 2610 II.getRawDest()->getType()), 2611 II.getValue(), Size, getPartitionAlign(), 2612 II.isVolatile()); 2613 (void)New; 2614 DEBUG(dbgs() << " to: " << *New << "\n"); 2615 return false; 2616 } 2617 2618 // If we can represent this as a simple value, we have to build the actual 2619 // value to store, which requires expanding the byte present in memset to 2620 // a sensible representation for the alloca type. This is essentially 2621 // splatting the byte to a sufficiently wide integer, bitcasting to the 2622 // desired scalar type, and splatting it across any desired vector type. 2623 uint64_t Size = EndOffset - BeginOffset; 2624 Value *V = II.getValue(); 2625 IntegerType *VTy = cast<IntegerType>(V->getType()); 2626 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); 2627 if (Size*8 > VTy->getBitWidth()) 2628 V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")), 2629 ConstantExpr::getUDiv( 2630 Constant::getAllOnesValue(SplatIntTy), 2631 ConstantExpr::getZExt( 2632 Constant::getAllOnesValue(V->getType()), 2633 SplatIntTy)), 2634 getName(".isplat")); 2635 2636 // If this is an element-wide memset of a vectorizable alloca, insert it. 2637 if (VecTy && (BeginOffset > NewAllocaBeginOffset || 2638 EndOffset < NewAllocaEndOffset)) { 2639 if (V->getType() != ScalarTy) 2640 V = convertValue(TD, IRB, V, ScalarTy); 2641 StoreInst *Store = IRB.CreateAlignedStore( 2642 IRB.CreateInsertElement(IRB.CreateAlignedLoad(&NewAI, 2643 NewAI.getAlignment(), 2644 getName(".load")), 2645 V, getIndex(IRB, BeginOffset), 2646 getName(".insert")), 2647 &NewAI, NewAI.getAlignment()); 2648 (void)Store; 2649 DEBUG(dbgs() << " to: " << *Store << "\n"); 2650 return true; 2651 } 2652 2653 // If this is a memset on an alloca where we can widen stores, insert the 2654 // set integer. 
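    // For example (illustrative), a memset of the byte 0xAB over bytes [2,4)
    // of an i32-widened alloca loads the existing i32, inserts the 16-bit
    // splat 0xABAB at the corresponding bit offset, and stores the merged
    // value back.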
2655     if (IntTy && (BeginOffset > NewAllocaBeginOffset ||
2656                   EndOffset < NewAllocaEndOffset)) {
2657       assert(!II.isVolatile());
2658       Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
2659                                          getName(".oldload"));
2660       Old = convertValue(TD, IRB, Old, IntTy);
2661       assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2662       uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
2663       V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert"));
2664     }
2665
2666     if (V->getType() != AllocaTy)
2667       V = convertValue(TD, IRB, V, AllocaTy);
2668
2669     Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
2670                                         II.isVolatile());
2671     (void)New;
2672     DEBUG(dbgs() << " to: " << *New << "\n");
2673     return !II.isVolatile();
2674   }
2675
2676   bool visitMemTransferInst(MemTransferInst &II) {
2677     // Rewriting of memory transfer instructions can be a bit tricky. We break
2678     // them into two categories: split intrinsics and unsplit intrinsics.
2679
2680     DEBUG(dbgs() << " original: " << II << "\n");
2681     IRBuilder<> IRB(&II);
2682
2683     assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr);
2684     bool IsDest = II.getRawDest() == OldPtr;
2685
2686     const AllocaPartitioning::MemTransferOffsets &MTO
2687       = P.getMemTransferOffsets(II);
2688
2689     assert(OldPtr->getType()->isPointerTy() && "Must be a pointer type!");
2690     unsigned AS = cast<PointerType>(OldPtr->getType())->getAddressSpace();
2691     // Compute the relative offset within the transfer.
2692     unsigned IntPtrWidth = TD.getPointerSizeInBits(AS);
2693     APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin
2694                                                        : MTO.SourceBegin));
2695
2696     unsigned Align = II.getAlignment();
2697     if (Align > 1)
2698       Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(),
2699                        MinAlign(II.getAlignment(), getPartitionAlign()));
2700
2701     // For unsplit intrinsics, we simply modify the source and destination
2702     // pointers in place. This isn't just an optimization, it is a matter of
2703     // correctness. With unsplit intrinsics we may be dealing with transfers
2704     // within a single alloca before SROA ran, or with transfers that have
2705     // a variable length. We may also be dealing with memmove instead of
2706     // memcpy, and so simply updating the pointers is necessary for us to
2707     // update both source and dest of a single call.
2708     if (!MTO.IsSplittable) {
2709       Value *OldOp = IsDest ? II.getRawDest() : II.getRawSource();
2710       if (IsDest)
2711         II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
2712       else
2713         II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType()));
2714
2715       Type *CstTy = II.getAlignmentCst()->getType();
2716       II.setAlignment(ConstantInt::get(CstTy, Align));
2717
2718       DEBUG(dbgs() << " to: " << II << "\n");
2719       deleteIfTriviallyDead(OldOp);
2720       return false;
2721     }
2722     // For split transfer intrinsics we have an incredibly useful assurance:
2723     // the source and destination do not reside within the same alloca, and at
2724     // least one of them does not escape. This means that we can replace
2725     // memmove with memcpy, and we don't need to worry about all manner of
2726     // downsides to splitting and transforming the operations.
2727
2728     // If this doesn't map cleanly onto the alloca type, and that type isn't
2729     // a single value type, just emit a memcpy.
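    // For example (illustrative), a transfer covering only the first half of
    // a { i32, i32 } new alloca cannot be expressed as a single load or store
    // of the alloca's type, so a (possibly narrowed) memcpy is emitted
    // instead.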
2730 bool EmitMemCpy 2731 = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset || 2732 EndOffset != NewAllocaEndOffset || 2733 !NewAI.getAllocatedType()->isSingleValueType()); 2734 2735 // If we're just going to emit a memcpy, the alloca hasn't changed, and the 2736 // size hasn't been shrunk based on analysis of the viable range, this is 2737 // a no-op. 2738 if (EmitMemCpy && &OldAI == &NewAI) { 2739 uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin; 2740 uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd; 2741 // Ensure the start lines up. 2742 assert(BeginOffset == OrigBegin); 2743 (void)OrigBegin; 2744 2745 // Rewrite the size as needed. 2746 if (EndOffset != OrigEnd) 2747 II.setLength(ConstantInt::get(II.getLength()->getType(), 2748 EndOffset - BeginOffset)); 2749 return false; 2750 } 2751 // Record this instruction for deletion. 2752 if (Pass.DeadSplitInsts.insert(&II)) 2753 Pass.DeadInsts.push_back(&II); 2754 2755 bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && 2756 EndOffset == NewAllocaEndOffset; 2757 bool IsVectorElement = VecTy && !IsWholeAlloca; 2758 uint64_t Size = EndOffset - BeginOffset; 2759 IntegerType *SubIntTy 2760 = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; 2761 2762 Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() 2763 : II.getRawDest()->getType(); 2764 if (!EmitMemCpy) { 2765 if (IsVectorElement) 2766 OtherPtrTy = VecTy->getElementType()->getPointerTo(); 2767 else if (IntTy && !IsWholeAlloca) 2768 OtherPtrTy = SubIntTy->getPointerTo(); 2769 else 2770 OtherPtrTy = NewAI.getType(); 2771 } 2772 2773 // Compute the other pointer, folding as much as possible to produce 2774 // a single, simple GEP in most cases. 2775 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); 2776 OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, 2777 getName("." + OtherPtr->getName())); 2778 2779 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2780 // alloca that should be re-examined after rewriting this instruction. 2781 if (AllocaInst *AI 2782 = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) 2783 Pass.Worklist.insert(AI); 2784 2785 if (EmitMemCpy) { 2786 Value *OurPtr 2787 = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() 2788 : II.getRawSource()->getType()); 2789 Type *SizeTy = II.getLength()->getType(); 2790 Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); 2791 2792 CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr, 2793 IsDest ? OtherPtr : OurPtr, 2794 Size, Align, II.isVolatile()); 2795 (void)New; 2796 DEBUG(dbgs() << " to: " << *New << "\n"); 2797 return false; 2798 } 2799 2800 // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy 2801 // is equivalent to 1, but that isn't true if we end up rewriting this as 2802 // a load or store. 2803 if (!Align) 2804 Align = 1; 2805 2806 Value *SrcPtr = OtherPtr; 2807 Value *DstPtr = &NewAI; 2808 if (!IsDest) 2809 std::swap(SrcPtr, DstPtr); 2810 2811 Value *Src; 2812 if (IsVectorElement && !IsDest) { 2813 // We have to extract rather than load. 
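      // For example (illustrative), copying element 1 out of a <4 x float>
      // alloca becomes a load of the whole vector followed by
      //   extractelement <4 x float> %vec, i32 1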
2814 Src = IRB.CreateExtractElement( 2815 IRB.CreateAlignedLoad(SrcPtr, Align, getName(".copyload")), 2816 getIndex(IRB, BeginOffset), 2817 getName(".copyextract")); 2818 } else if (IntTy && !IsWholeAlloca && !IsDest) { 2819 Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2820 getName(".load")); 2821 Src = convertValue(TD, IRB, Src, IntTy); 2822 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2823 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2824 Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract")); 2825 } else { 2826 Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), 2827 getName(".copyload")); 2828 } 2829 2830 if (IntTy && !IsWholeAlloca && IsDest) { 2831 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2832 getName(".oldload")); 2833 Old = convertValue(TD, IRB, Old, IntTy); 2834 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2835 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2836 Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert")); 2837 Src = convertValue(TD, IRB, Src, NewAllocaTy); 2838 } 2839 2840 if (IsVectorElement && IsDest) { 2841 // We have to insert into a loaded copy before storing. 2842 Src = IRB.CreateInsertElement( 2843 IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")), 2844 Src, getIndex(IRB, BeginOffset), 2845 getName(".insert")); 2846 } 2847 2848 StoreInst *Store = cast<StoreInst>( 2849 IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile())); 2850 (void)Store; 2851 DEBUG(dbgs() << " to: " << *Store << "\n"); 2852 return !II.isVolatile(); 2853 } 2854 2855 bool visitIntrinsicInst(IntrinsicInst &II) { 2856 assert(II.getIntrinsicID() == Intrinsic::lifetime_start || 2857 II.getIntrinsicID() == Intrinsic::lifetime_end); 2858 DEBUG(dbgs() << " original: " << II << "\n"); 2859 IRBuilder<> IRB(&II); 2860 assert(II.getArgOperand(1) == OldPtr); 2861 2862 // Record this instruction for deletion. 2863 if (Pass.DeadSplitInsts.insert(&II)) 2864 Pass.DeadInsts.push_back(&II); 2865 2866 ConstantInt *Size 2867 = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), 2868 EndOffset - BeginOffset); 2869 Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType()); 2870 Value *New; 2871 if (II.getIntrinsicID() == Intrinsic::lifetime_start) 2872 New = IRB.CreateLifetimeStart(Ptr, Size); 2873 else 2874 New = IRB.CreateLifetimeEnd(Ptr, Size); 2875 2876 DEBUG(dbgs() << " to: " << *New << "\n"); 2877 return true; 2878 } 2879 2880 bool visitPHINode(PHINode &PN) { 2881 DEBUG(dbgs() << " original: " << PN << "\n"); 2882 2883 // We would like to compute a new pointer in only one place, but have it be 2884 // as local as possible to the PHI. To do that, we re-use the location of 2885 // the old pointer, which necessarily must be in the right position to 2886 // dominate the PHI. 2887 IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr)); 2888 2889 Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); 2890 // Replace the operands which were using the old pointer. 2891 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); 2892 2893 DEBUG(dbgs() << " to: " << PN << "\n"); 2894 deleteIfTriviallyDead(OldPtr); 2895 return false; 2896 } 2897 2898 bool visitSelectInst(SelectInst &SI) { 2899 DEBUG(dbgs() << " original: " << SI << "\n"); 2900 IRBuilder<> IRB(&SI); 2901 2902 // Find the operand we need to rewrite here. 
2903 bool IsTrueVal = SI.getTrueValue() == OldPtr; 2904 if (IsTrueVal) 2905 assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!"); 2906 else 2907 assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!"); 2908 2909 Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType()); 2910 SI.setOperand(IsTrueVal ? 1 : 2, NewPtr); 2911 DEBUG(dbgs() << " to: " << SI << "\n"); 2912 deleteIfTriviallyDead(OldPtr); 2913 return false; 2914 } 2915 2916}; 2917} 2918 2919namespace { 2920/// \brief Visitor to rewrite aggregate loads and stores as scalar. 2921/// 2922/// This pass aggressively rewrites all aggregate loads and stores on 2923/// a particular pointer (or any pointer derived from it which we can identify) 2924/// with scalar loads and stores. 2925class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { 2926 // Befriend the base class so it can delegate to private visit methods. 2927 friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; 2928 2929 const DataLayout &TD; 2930 2931 /// Queue of pointer uses to analyze and potentially rewrite. 2932 SmallVector<Use *, 8> Queue; 2933 2934 /// Set to prevent us from cycling with phi nodes and loops. 2935 SmallPtrSet<User *, 8> Visited; 2936 2937 /// The current pointer use being rewritten. This is used to dig up the used 2938 /// value (as opposed to the user). 2939 Use *U; 2940 2941public: 2942 AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {} 2943 2944 /// Rewrite loads and stores through a pointer and all pointers derived from 2945 /// it. 2946 bool rewrite(Instruction &I) { 2947 DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); 2948 enqueueUsers(I); 2949 bool Changed = false; 2950 while (!Queue.empty()) { 2951 U = Queue.pop_back_val(); 2952 Changed |= visit(cast<Instruction>(U->getUser())); 2953 } 2954 return Changed; 2955 } 2956 2957private: 2958 /// Enqueue all the users of the given instruction for further processing. 2959 /// This uses a set to de-duplicate users. 2960 void enqueueUsers(Instruction &I) { 2961 for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; 2962 ++UI) 2963 if (Visited.insert(*UI)) 2964 Queue.push_back(&UI.getUse()); 2965 } 2966 2967 // Conservative default is to not rewrite anything. 2968 bool visitInstruction(Instruction &I) { return false; } 2969 2970 /// \brief Generic recursive split emission class. 2971 template <typename Derived> 2972 class OpSplitter { 2973 protected: 2974 /// The builder used to form new instructions. 2975 IRBuilder<> IRB; 2976 /// The indices which to be used with insert- or extractvalue to select the 2977 /// appropriate value within the aggregate. 2978 SmallVector<unsigned, 4> Indices; 2979 /// The indices to a GEP instruction which will move Ptr to the correct slot 2980 /// within the aggregate. 2981 SmallVector<Value *, 4> GEPIndices; 2982 /// The base pointer of the original op, used as a base for GEPing the 2983 /// split operations. 2984 Value *Ptr; 2985 2986 /// Initialize the splitter with an insertion point, Ptr and start with a 2987 /// single zero GEP index. 2988 OpSplitter(Instruction *InsertionPoint, Value *Ptr) 2989 : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} 2990 2991 public: 2992 /// \brief Generic recursive split emission routine. 2993 /// 2994 /// This method recursively splits an aggregate op (load or store) into 2995 /// scalar or vector ops. It splits recursively until it hits a single value 2996 /// and emits that single value operation via the template argument. 
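    /// For example (illustrative), splitting a load of { i32, [2 x i32] }
    /// produces three scalar i32 loads whose results are reassembled with
    /// insertvalue.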
2997 /// 2998 /// The logic of this routine relies on GEPs and insertvalue and 2999 /// extractvalue all operating with the same fundamental index list, merely 3000 /// formatted differently (GEPs need actual values). 3001 /// 3002 /// \param Ty The type being split recursively into smaller ops. 3003 /// \param Agg The aggregate value being built up or stored, depending on 3004 /// whether this is splitting a load or a store respectively. 3005 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) { 3006 if (Ty->isSingleValueType()) 3007 return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name); 3008 3009 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 3010 unsigned OldSize = Indices.size(); 3011 (void)OldSize; 3012 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size; 3013 ++Idx) { 3014 assert(Indices.size() == OldSize && "Did not return to the old size"); 3015 Indices.push_back(Idx); 3016 GEPIndices.push_back(IRB.getInt32(Idx)); 3017 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx)); 3018 GEPIndices.pop_back(); 3019 Indices.pop_back(); 3020 } 3021 return; 3022 } 3023 3024 if (StructType *STy = dyn_cast<StructType>(Ty)) { 3025 unsigned OldSize = Indices.size(); 3026 (void)OldSize; 3027 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size; 3028 ++Idx) { 3029 assert(Indices.size() == OldSize && "Did not return to the old size"); 3030 Indices.push_back(Idx); 3031 GEPIndices.push_back(IRB.getInt32(Idx)); 3032 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx)); 3033 GEPIndices.pop_back(); 3034 Indices.pop_back(); 3035 } 3036 return; 3037 } 3038 3039 llvm_unreachable("Only arrays and structs are aggregate loadable types"); 3040 } 3041 }; 3042 3043 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { 3044 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr) 3045 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} 3046 3047 /// Emit a leaf load of a single value. This is called at the leaves of the 3048 /// recursive emission to actually load values. 3049 void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { 3050 assert(Ty->isSingleValueType()); 3051 // Load the single value and insert it using the indices. 3052 Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices, 3053 Name + ".gep"), 3054 Name + ".load"); 3055 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); 3056 DEBUG(dbgs() << " to: " << *Load << "\n"); 3057 } 3058 }; 3059 3060 bool visitLoadInst(LoadInst &LI) { 3061 assert(LI.getPointerOperand() == *U); 3062 if (!LI.isSimple() || LI.getType()->isSingleValueType()) 3063 return false; 3064 3065 // We have an aggregate being loaded, split it apart. 3066 DEBUG(dbgs() << " original: " << LI << "\n"); 3067 LoadOpSplitter Splitter(&LI, *U); 3068 Value *V = UndefValue::get(LI.getType()); 3069 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); 3070 LI.replaceAllUsesWith(V); 3071 LI.eraseFromParent(); 3072 return true; 3073 } 3074 3075 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { 3076 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) 3077 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} 3078 3079 /// Emit a leaf store of a single value. This is called at the leaves of the 3080 /// recursive emission to actually produce stores. 3081 void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { 3082 assert(Ty->isSingleValueType()); 3083 // Extract the single value and store it using the indices. 
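      // For example (illustrative), for field 1 of a { i32, float } store:
      //   %x = extractvalue { i32, float } %agg, 1
      //   store float %x, float* %gep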
  bool visitStoreInst(StoreInst &SI) {
    if (!SI.isSimple() || SI.getPointerOperand() != *U)
      return false;
    Value *V = SI.getValueOperand();
    if (V->getType()->isSingleValueType())
      return false;

    // We have an aggregate being stored, split it apart.
    DEBUG(dbgs() << " original: " << SI << "\n");
    StoreOpSplitter Splitter(&SI, *U);
    Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
    SI.eraseFromParent();
    return true;
  }

  bool visitBitCastInst(BitCastInst &BC) {
    enqueueUsers(BC);
    return false;
  }

  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    enqueueUsers(GEPI);
    return false;
  }

  bool visitPHINode(PHINode &PN) {
    enqueueUsers(PN);
    return false;
  }

  bool visitSelectInst(SelectInst &SI) {
    enqueueUsers(SI);
    return false;
  }
};
}

/// \brief Strip aggregate type wrapping.
///
/// This removes no-op aggregate types wrapping an underlying type. It will
/// strip as many layers of types as it can without changing either the type
/// size or the allocated size.
static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
  if (Ty->isSingleValueType())
    return Ty;

  uint64_t AllocSize = DL.getTypeAllocSize(Ty);
  uint64_t TypeSize = DL.getTypeSizeInBits(Ty);

  Type *InnerTy;
  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
    InnerTy = ArrTy->getElementType();
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    const StructLayout *SL = DL.getStructLayout(STy);
    unsigned Index = SL->getElementContainingOffset(0);
    InnerTy = STy->getElementType(Index);
  } else {
    return Ty;
  }

  if (AllocSize > DL.getTypeAllocSize(InnerTy) ||
      TypeSize > DL.getTypeSizeInBits(InnerTy))
    return Ty;

  return stripAggregateTypeWrapping(DL, InnerTy);
}

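// For example (illustrative): { { float } } and [1 x float] both strip down
// to float, since the wrappers add neither bits nor alloc size, whereas
// { float, i8 } is returned unchanged because its first element is strictly
// smaller than the aggregate.
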
/// \brief Try to find a partition of the aggregate type passed in for a given
/// offset and size.
///
/// This recurses through the aggregate type and tries to compute a subtype
/// based on the offset and size. When the offset and size span a sub-section
/// of an array, it will even compute a new array type for that sub-section,
/// and the same for structs.
///
/// Note that this routine is very strict and tries to find a partition of the
/// type which produces the *exact* right offset and size. It is not forgiving
/// when the size or offset cause either end of the type-based partition to be
/// off. Also, this is a best-effort routine. It is reasonable to give up and
/// not return a type if necessary.
static Type *getTypePartition(const DataLayout &TD, Type *Ty,
                              uint64_t Offset, uint64_t Size) {
  if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size)
    return stripAggregateTypeWrapping(TD, Ty);

  if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
    // We can't partition pointers...
    if (SeqTy->isPointerTy())
      return 0;

    Type *ElementTy = SeqTy->getElementType();
    uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
    uint64_t NumSkippedElements = Offset / ElementSize;
    if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy))
      if (NumSkippedElements >= ArrTy->getNumElements())
        return 0;
    if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy))
      if (NumSkippedElements >= VecTy->getNumElements())
        return 0;
    Offset -= NumSkippedElements * ElementSize;

    // First check if we need to recurse.
    if (Offset > 0 || Size < ElementSize) {
      // Bail if the partition ends in a different array element.
      if ((Offset + Size) > ElementSize)
        return 0;
      // Recurse through the element type trying to peel off offset bytes.
      return getTypePartition(TD, ElementTy, Offset, Size);
    }
    assert(Offset == 0);

    if (Size == ElementSize)
      return stripAggregateTypeWrapping(TD, ElementTy);
    assert(Size > ElementSize);
    uint64_t NumElements = Size / ElementSize;
    if (NumElements * ElementSize != Size)
      return 0;
    return ArrayType::get(ElementTy, NumElements);
  }

  StructType *STy = dyn_cast<StructType>(Ty);
  if (!STy)
    return 0;

  const StructLayout *SL = TD.getStructLayout(STy);
  if (Offset >= SL->getSizeInBytes())
    return 0;
  uint64_t EndOffset = Offset + Size;
  if (EndOffset > SL->getSizeInBytes())
    return 0;

  unsigned Index = SL->getElementContainingOffset(Offset);
  Offset -= SL->getElementOffset(Index);

  Type *ElementTy = STy->getElementType(Index);
  uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
  if (Offset >= ElementSize)
    return 0; // The offset points into alignment padding.

  // See if any partition must be contained by the element.
  if (Offset > 0 || Size < ElementSize) {
    if ((Offset + Size) > ElementSize)
      return 0;
    return getTypePartition(TD, ElementTy, Offset, Size);
  }
  assert(Offset == 0);

  if (Size == ElementSize)
    return stripAggregateTypeWrapping(TD, ElementTy);

  StructType::element_iterator EI = STy->element_begin() + Index,
                               EE = STy->element_end();
  if (EndOffset < SL->getSizeInBytes()) {
    unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
    if (Index == EndIndex)
      return 0; // Within a single element and its padding.

    // Don't try to form "natural" types if the elements don't line up with the
    // expected size.
    // FIXME: We could potentially recurse down through the last element in the
    // sub-struct to find a natural end point.
    if (SL->getElementOffset(EndIndex) != EndOffset)
      return 0;

    assert(Index < EndIndex);
    EE = STy->element_begin() + EndIndex;
  }

  // Try to build up a sub-structure.
  StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE),
                                      STy->isPacked());
  const StructLayout *SubSL = TD.getStructLayout(SubTy);
  if (Size != SubSL->getSizeInBytes())
    return 0; // The sub-struct doesn't have quite the size needed.

  return SubTy;
}

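// For instance (illustrative, assuming a layout with no surprising padding):
// for the type { i32, i32, [8 x i8] },
//   Offset 4, Size 4  yields i32 (the second element),
//   Offset 0, Size 8  yields the sub-struct { i32, i32 },
//   Offset 8, Size 4  yields the sub-array [4 x i8],
// while an access that straddles element boundaries, such as Offset 2 with
// Size 4, makes the routine give up and return null.
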
/// \brief Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
/// to rewrite uses of an alloca partition to be conducive to SSA value
/// promotion. If the partition needs a new, more refined alloca, this will
/// build that new alloca, preserving as much type information as possible, and
/// rewrite the uses of the old alloca to point at the new one and have the
/// appropriate new offsets. It also evaluates how successful the rewrite was
/// at enabling promotion, and if it was successful queues the alloca to be
/// promoted.
bool SROA::rewriteAllocaPartition(AllocaInst &AI,
                                  AllocaPartitioning &P,
                                  AllocaPartitioning::iterator PI) {
  uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset;
  bool IsLive = false;
  for (AllocaPartitioning::use_iterator UI = P.use_begin(PI),
                                        UE = P.use_end(PI);
       UI != UE && !IsLive; ++UI)
    if (UI->U)
      IsLive = true;
  if (!IsLive)
    return false; // No live uses left of this partition.

  DEBUG(dbgs() << "Speculating PHIs and selects in partition "
               << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n");

  PHIOrSelectSpeculator Speculator(*TD, P, *this);
  DEBUG(dbgs() << " speculating ");
  DEBUG(P.print(dbgs(), PI, ""));
  Speculator.visitUsers(PI);

  // Try to compute a friendly type for this partition of the alloca. This
  // won't always succeed, in which case we fall back to a legal integer type
  // or an i8 array of an appropriate size.
  Type *AllocaTy = 0;
  if (Type *PartitionTy = P.getCommonType(PI))
    if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize)
      AllocaTy = PartitionTy;
  if (!AllocaTy)
    if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(),
                                             PI->BeginOffset, AllocaSize))
      AllocaTy = PartitionTy;
  if ((!AllocaTy ||
       (AllocaTy->isArrayTy() &&
        AllocaTy->getArrayElementType()->isIntegerTy())) &&
      TD->isLegalInteger(AllocaSize * 8))
    AllocaTy = Type::getIntNTy(*C, AllocaSize * 8);
  if (!AllocaTy)
    AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize);
  assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize);

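  // For example (illustrative): an 8-byte partition for which neither a
  // common use type nor a natural partition type was found becomes an i64
  // alloca on targets where i64 is a legal integer type, and an [8 x i8]
  // alloca otherwise.
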
  // Check for the case where we're going to rewrite to a new alloca of the
  // exact same type as the original, and with the same access offsets. In that
  // case, re-use the existing alloca, but still run through the rewriter to
  // perform phi and select speculation.
  AllocaInst *NewAI;
  if (AllocaTy == AI.getAllocatedType()) {
    assert(PI->BeginOffset == 0 &&
           "Non-zero begin offset but same alloca type");
    assert(PI == P.begin() && "Begin offset is zero on later partition");
    NewAI = &AI;
  } else {
    unsigned Alignment = AI.getAlignment();
    if (!Alignment) {
      // The minimum alignment which users can rely on when the explicit
      // alignment is omitted or zero is that required by the ABI for this
      // type.
      Alignment = TD->getABITypeAlignment(AI.getAllocatedType());
    }
    Alignment = MinAlign(Alignment, PI->BeginOffset);
    // If we will get at least this much alignment from the type alone, leave
    // the alloca's alignment unconstrained.
    if (Alignment <= TD->getABITypeAlignment(AllocaTy))
      Alignment = 0;
    NewAI = new AllocaInst(AllocaTy, 0, Alignment,
                           AI.getName() + ".sroa." + Twine(PI - P.begin()),
                           &AI);
    ++NumNewAllocas;
  }

  DEBUG(dbgs() << "Rewriting alloca partition "
               << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: "
               << *NewAI << "\n");

  // Track the high watermark of the post-promotion worklist. We will reset it
  // to this point if the alloca is not in fact scheduled for promotion.
  unsigned PPWOldSize = PostPromotionWorklist.size();

  AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI,
                                   PI->BeginOffset, PI->EndOffset);
  DEBUG(dbgs() << " rewriting ");
  DEBUG(P.print(dbgs(), PI, ""));
  bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI));
  if (Promotable) {
    DEBUG(dbgs() << " and queuing for promotion\n");
    PromotableAllocas.push_back(NewAI);
  } else if (NewAI != &AI) {
    // If we can't promote the alloca, iterate on it to check for new
    // refinements exposed by splitting the current alloca. Don't iterate on an
    // alloca which didn't actually change and didn't get promoted.
    Worklist.insert(NewAI);
  }

  // Drop any post-promotion work items if promotion didn't happen.
  if (!Promotable)
    while (PostPromotionWorklist.size() > PPWOldSize)
      PostPromotionWorklist.pop_back();

  return true;
}

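// Illustrative effect (hypothetical IR): an alloca such as
//
//   %a = alloca { i64, i64 }
//
// whose two halves are only ever accessed independently is rewritten into
// separate allocas named %a.sroa.0 and %a.sroa.1, each of which can then be
// promoted to an SSA value on its own.
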
/// \brief Walks the partitioning of an alloca rewriting uses of each partition.
bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) {
  bool Changed = false;
  for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE;
       ++PI)
    Changed |= rewriteAllocaPartition(AI, P, PI);

  return Changed;
}

/// \brief Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// a partitioning of the alloca, and then hands it off to be split and
/// rewritten as needed.
bool SROA::runOnAlloca(AllocaInst &AI) {
  DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
  ++NumAllocasAnalyzed;

  // Special case dead allocas, as they're trivial.
  if (AI.use_empty()) {
    AI.eraseFromParent();
    return true;
  }

  // Skip alloca forms that this analysis can't handle.
  if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
      TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
    return false;

  bool Changed = false;

  // First, split any FCA loads and stores touching this alloca to promote
  // better splitting and promotion opportunities.
  AggLoadStoreRewriter AggRewriter(*TD);
  Changed |= AggRewriter.rewrite(AI);

  // Build the partition set using a recursive instruction-visiting builder.
  AllocaPartitioning P(*TD, AI);
  DEBUG(P.print(dbgs()));
  if (P.isEscaped())
    return Changed;

  // Delete all the dead users of this alloca before splitting and rewriting it.
  for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(),
                                              DE = P.dead_user_end();
       DI != DE; ++DI) {
    Changed = true;
    (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
    DeadInsts.push_back(*DI);
  }
  for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(),
                                            DE = P.dead_op_end();
       DO != DE; ++DO) {
    Value *OldV = **DO;
    // Clobber the use with an undef value.
    **DO = UndefValue::get(OldV->getType());
    if (Instruction *OldI = dyn_cast<Instruction>(OldV))
      if (isInstructionTriviallyDead(OldI)) {
        Changed = true;
        DeadInsts.push_back(OldI);
      }
  }

  // No partitions to split. Leave the dead alloca for a later pass to clean up.
  if (P.begin() == P.end())
    return Changed;

  return splitAlloca(AI, P) || Changed;
}

/// \brief Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
/// minimize the problems of invalidated instruction pointers as such pointers
/// are used heavily in the intermediate stages of the algorithm.
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
  DeadSplitInsts.clear();
  while (!DeadInsts.empty()) {
    Instruction *I = DeadInsts.pop_back_val();
    DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");

    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
        // Zero out the operand and see if it becomes trivially dead.
        *OI = 0;
        if (isInstructionTriviallyDead(U))
          DeadInsts.push_back(U);
      }

    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
      DeletedAllocas.insert(AI);

    ++NumDeleted;
    I->eraseFromParent();
  }
}

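// For example (illustrative): deleting a dead store can leave the GEP that
// computed its address with no remaining uses; the operand zeroing above then
// makes that GEP trivially dead, so it is queued and deleted as well.
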
      if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
        assert(onlyUsedByLifetimeMarkers(I) &&
               "Found a bitcast used outside of a lifetime marker.");
        while (!I->use_empty())
          cast<Instruction>(*I->use_begin())->eraseFromParent();
        I->eraseFromParent();
        continue;
      }
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
        assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
               II->getIntrinsicID() == Intrinsic::lifetime_end);
        II->eraseFromParent();
        continue;
      }

      Insts.push_back(I);
    }
    AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
    Insts.clear();
  }

  PromotableAllocas.clear();
  return true;
}

namespace {
  /// \brief A predicate to test whether an alloca belongs to a set.
  class IsAllocaInSet {
    typedef SmallPtrSet<AllocaInst *, 4> SetType;
    const SetType &Set;

  public:
    typedef AllocaInst *argument_type;

    IsAllocaInSet(const SetType &Set) : Set(Set) {}
    bool operator()(AllocaInst *AI) const { return Set.count(AI); }
  };
}

bool SROA::runOnFunction(Function &F) {
  DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
  C = &F.getContext();
  TD = getAnalysisIfAvailable<DataLayout>();
  if (!TD) {
    DEBUG(dbgs() << " Skipping SROA -- no target data!\n");
    return false;
  }
  DT = getAnalysisIfAvailable<DominatorTree>();

  BasicBlock &EntryBB = F.getEntryBlock();
  for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end());
       I != E; ++I)
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
      Worklist.insert(AI);

  bool Changed = false;
  // A set of deleted alloca instruction pointers which should be removed from
  // the list of promotable allocas.
  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;

  do {
    while (!Worklist.empty()) {
      Changed |= runOnAlloca(*Worklist.pop_back_val());
      deleteDeadInstructions(DeletedAllocas);

      // Remove the deleted allocas from various lists so that we don't try to
      // continue processing them.
      if (!DeletedAllocas.empty()) {
        Worklist.remove_if(IsAllocaInSet(DeletedAllocas));
        PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas));
        PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
                                               PromotableAllocas.end(),
                                               IsAllocaInSet(DeletedAllocas)),
                                PromotableAllocas.end());
        DeletedAllocas.clear();
      }
    }

    Changed |= promoteAllocas(F);

    Worklist = PostPromotionWorklist;
    PostPromotionWorklist.clear();
  } while (!Worklist.empty());

  return Changed;
}

void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
  if (RequiresDomTree)
    AU.addRequired<DominatorTree>();
  AU.setPreservesCFG();
}
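
// Usage note (illustrative): this pass can be exercised in isolation through
// opt, e.g. "opt -sroa -S input.ll"; the -force-ssa-updater flag defined above
// switches promotion from mem2reg to the SSAUpdater-based path.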