SROA.cpp revision 72bf29f45d9da6a2a6be910f55a4787d30d3d2eb
//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This transformation implements the well known scalar replacement of
/// aggregates transformation. It tries to identify promotable elements of an
/// aggregate alloca, and promote them to registers. It will also try to
/// convert uses of an element (or set of elements) of an alloca into a vector
/// or bitfield-style integer scalar if appropriate.
///
/// It works to do this with minimal slicing of the alloca so that regions
/// which are merely transferred in and out of external memory remain unchanged
/// and are not decomposed to scalar code.
///
/// Because this also performs alloca promotion, it can be thought of as also
/// serving the purpose of SSA formation. The algorithm iterates on the
/// function until all opportunities for promotion have been realized.
///
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "sroa"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Constants.h"
#include "llvm/DIBuilder.h"
#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/GlobalVariable.h"
#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Operator.h"
#include "llvm/Pass.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/InstVisitor.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;

STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
STATISTIC(NumDeleted, "Number of instructions deleted");
STATISTIC(NumVectorized, "Number of vectorized aggregates");

/// Hidden option to force the pass to not use DomTree and mem2reg, instead
/// forming SSA values through the SSAUpdater infrastructure.
static cl::opt<bool>
ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden);

namespace {
/// \brief Alloca partitioning representation.
///
/// This class represents a partitioning of an alloca into slices, and
/// information about the nature of uses of each slice of the alloca. The goal
/// is that this information is sufficient to decide if and how to split the
/// alloca apart and replace slices with scalars. It is also intended that this
/// structure can capture the relevant information needed both to decide about
/// and to enact these transformations.
class AllocaPartitioning {
public:
  /// \brief A common base class for representing a half-open byte range.
  struct ByteRange {
    /// \brief The beginning offset of the range.
    uint64_t BeginOffset;

    /// \brief The ending offset, not included in the range.
    uint64_t EndOffset;

    ByteRange() : BeginOffset(), EndOffset() {}
    ByteRange(uint64_t BeginOffset, uint64_t EndOffset)
        : BeginOffset(BeginOffset), EndOffset(EndOffset) {}

    /// \brief Support for ordering ranges.
    ///
    /// This provides an ordering over ranges such that start offsets are
    /// always increasing, and within equal start offsets, the end offsets are
    /// decreasing. Thus the spanning range comes first in a cluster with the
    /// same start position.
    bool operator<(const ByteRange &RHS) const {
      if (BeginOffset < RHS.BeginOffset) return true;
      if (BeginOffset > RHS.BeginOffset) return false;
      if (EndOffset > RHS.EndOffset) return true;
      return false;
    }

    /// \brief Support comparison with a single offset to allow binary searches.
    friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) {
      return LHS.BeginOffset < RHSOffset;
    }

    friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
                                                const ByteRange &RHS) {
      return LHSOffset < RHS.BeginOffset;
    }

    bool operator==(const ByteRange &RHS) const {
      return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset;
    }
    bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); }
  };
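
  // Illustrative example of the ordering above: the ranges [0,16), [0,8), and
  // [4,12) sort as [0,16), [0,8), [4,12) -- the widest range starting at a
  // given offset precedes the narrower ones it spans.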

  /// \brief A partition of an alloca.
  ///
  /// This structure represents a contiguous partition of the alloca. These are
  /// formed by examining the uses of the alloca. During formation, they may
  /// overlap but once an AllocaPartitioning is built, the Partitions within it
  /// are all disjoint.
  struct Partition : public ByteRange {
    /// \brief Whether this partition is splittable into smaller partitions.
    ///
    /// We flag partitions as splittable when they are formed entirely due to
    /// accesses by trivially splittable operations such as memset and memcpy.
    ///
    /// FIXME: At some point we should consider loads and stores of FCAs to be
    /// splittable and eagerly split them into scalar values.
    bool IsSplittable;

    Partition() : ByteRange(), IsSplittable() {}
    Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable)
        : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {}
  };

  /// \brief A particular use of a partition of the alloca.
  ///
  /// This structure is used to associate uses of a partition with it. They
  /// mark the range of bytes which are referenced by a particular instruction,
  /// and include a handle to the user itself and the pointer value in use.
  /// The bounds of these uses are determined by intersecting the bounds of the
  /// memory use itself with a particular partition. As a consequence there is
  /// intentional overlap between various uses of the same partition.
  struct PartitionUse : public ByteRange {
    /// \brief The user of this range of the alloca.
    AssertingVH<Instruction> User;

    /// \brief The particular pointer value derived from this alloca in use.
    AssertingVH<Instruction> Ptr;

    PartitionUse() : ByteRange(), User(), Ptr() {}
    PartitionUse(uint64_t BeginOffset, uint64_t EndOffset,
                 Instruction *User, Instruction *Ptr)
        : ByteRange(BeginOffset, EndOffset), User(User), Ptr(Ptr) {}
  };

  /// \brief Construct a partitioning of a particular alloca.
  ///
  /// Construction does most of the work for partitioning the alloca. This
  /// performs the necessary walks of users and builds a partitioning from it.
  AllocaPartitioning(const TargetData &TD, AllocaInst &AI);

  /// \brief Test whether a pointer to the allocation escapes our analysis.
  ///
  /// If this is true, the partitioning is never fully built and should be
  /// ignored.
  bool isEscaped() const { return PointerEscapingInstr; }

  /// \brief Support for iterating over the partitions.
  /// @{
  typedef SmallVectorImpl<Partition>::iterator iterator;
  iterator begin() { return Partitions.begin(); }
  iterator end() { return Partitions.end(); }

  typedef SmallVectorImpl<Partition>::const_iterator const_iterator;
  const_iterator begin() const { return Partitions.begin(); }
  const_iterator end() const { return Partitions.end(); }
  /// @}

  /// \brief Support for iterating over and manipulating a particular
  /// partition's uses.
  ///
  /// The iteration support provided for uses is more limited, but also
  /// includes some manipulation routines to support rewriting the uses of
  /// partitions during SROA.
  /// @{
  typedef SmallVectorImpl<PartitionUse>::iterator use_iterator;
  use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); }
  use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); }
  use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); }
  use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); }
  void use_push_back(unsigned Idx, const PartitionUse &U) {
    Uses[Idx].push_back(U);
  }
  void use_push_back(const_iterator I, const PartitionUse &U) {
    Uses[I - begin()].push_back(U);
  }
  void use_erase(unsigned Idx, use_iterator UI) { Uses[Idx].erase(UI); }
  void use_erase(const_iterator I, use_iterator UI) {
    Uses[I - begin()].erase(UI);
  }

  typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator;
  const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); }
  const_use_iterator use_begin(const_iterator I) const {
    return Uses[I - begin()].begin();
  }
  const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); }
  const_use_iterator use_end(const_iterator I) const {
    return Uses[I - begin()].end();
  }
  /// @}

  /// \brief Allow iterating the dead users for this alloca.
  ///
  /// These are instructions which will never actually use the alloca as they
  /// are outside the allocated range. They are safe to replace with undef and
  /// delete.
  /// @{
  typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator;
  dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); }
  dead_user_iterator dead_user_end() const { return DeadUsers.end(); }
  /// @}

  /// \brief Allow iterating the dead expressions referring to this alloca.
  ///
  /// These are operands which cannot actually be used to refer to the
  /// alloca as they are outside its range and the user doesn't correct for
  /// that. These mostly consist of PHI node inputs and the like which we just
  /// need to replace with undef.
  /// @{
  typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator;
  dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); }
  dead_op_iterator dead_op_end() const { return DeadOperands.end(); }
  /// @}

  /// \brief MemTransferInst auxiliary data.
  /// This struct provides some auxiliary data about memory transfer
  /// intrinsics such as memcpy and memmove. These intrinsics can use two
  /// different ranges within the same alloca, and provide other challenges to
  /// correctly represent. We stash extra data to help us untangle this
  /// after the partitioning is complete.
  struct MemTransferOffsets {
    uint64_t DestBegin, DestEnd;
    uint64_t SourceBegin, SourceEnd;
    bool IsSplittable;
  };
  MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const {
    return MemTransferInstData.lookup(&II);
  }

  /// \brief Map from a PHI or select operand back to a partition.
  ///
  /// When manipulating PHI nodes or selects, they can use more than one
  /// partition of an alloca. We store a special mapping to allow finding the
  /// partition referenced by each of these operands, if any.
  iterator findPartitionForPHIOrSelectOperand(Instruction &I, Value *Op) {
    SmallDenseMap<std::pair<Instruction *, Value *>,
                  std::pair<unsigned, unsigned> >::const_iterator MapIt
      = PHIOrSelectOpMap.find(std::make_pair(&I, Op));
    if (MapIt == PHIOrSelectOpMap.end())
      return end();

    return begin() + MapIt->second.first;
  }

  /// \brief Map from a PHI or select operand back to the specific use of
  /// a partition.
  ///
  /// Similar to mapping these operands back to the partitions, this maps
  /// directly to the use structure of that partition.
  use_iterator findPartitionUseForPHIOrSelectOperand(Instruction &I,
                                                     Value *Op) {
    SmallDenseMap<std::pair<Instruction *, Value *>,
                  std::pair<unsigned, unsigned> >::const_iterator MapIt
      = PHIOrSelectOpMap.find(std::make_pair(&I, Op));
    assert(MapIt != PHIOrSelectOpMap.end());
    return Uses[MapIt->second.first].begin() + MapIt->second.second;
  }

  /// \brief Compute a common type among the uses of a particular partition.
  ///
  /// This routine walks all of the uses of a particular partition and tries
  /// to find a common type between them. Untyped operations such as memset and
  /// memcpy are ignored.
  Type *getCommonType(iterator I) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
  void printUsers(raw_ostream &OS, const_iterator I,
                  StringRef Indent = "  ") const;
  void print(raw_ostream &OS) const;
  void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const;
  void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const;
#endif

private:
  template <typename DerivedT, typename RetT = void> class BuilderBase;
  class PartitionBuilder;
  friend class AllocaPartitioning::PartitionBuilder;
  class UseBuilder;
  friend class AllocaPartitioning::UseBuilder;

#ifndef NDEBUG
  /// \brief Handle to alloca instruction to simplify method interfaces.
  AllocaInst &AI;
#endif

  /// \brief The instruction responsible for this alloca having no partitioning.
  ///
  /// When an instruction (potentially) escapes the pointer to the alloca, we
  /// store a pointer to that here and abort trying to partition the alloca.
  /// This will be null if the alloca is partitioned successfully.
  Instruction *PointerEscapingInstr;

  /// \brief The partitions of the alloca.
  ///
  /// We store a vector of the partitions over the alloca here. This vector is
  /// sorted by increasing begin offset, and then by decreasing end offset. See
  /// the Partition inner class for more details. Initially (during
  /// construction) there are overlaps, but we form a disjoint sequence of
  /// partitions while finishing construction and a fully constructed object is
  /// expected to always have this as a disjoint space.
  SmallVector<Partition, 8> Partitions;

  /// \brief The uses of the partitions.
  ///
  /// This is essentially a mapping from each partition to a list of uses of
  /// that partition. The mapping is done with a Uses vector that has the exact
  /// same number of entries as the partition vector. Each entry is itself
  /// a vector of the uses.
  SmallVector<SmallVector<PartitionUse, 2>, 8> Uses;

  /// \brief Instructions which will become dead if we rewrite the alloca.
  ///
  /// Note that these are not separated by partition. This is because we expect
  /// a partitioned alloca to be completely rewritten or not rewritten at all.
  /// If rewritten, all these instructions can simply be removed and replaced
  /// with undef as they come from outside of the allocated space.
  SmallVector<Instruction *, 8> DeadUsers;

  /// \brief Operands which will become dead if we rewrite the alloca.
  ///
  /// These are operands that in their particular use can be replaced with
  /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
  /// to PHI nodes and the like. They aren't entirely dead (there might be
  /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
  /// want to swap this particular input for undef to simplify the use lists of
  /// the alloca.
  SmallVector<Use *, 8> DeadOperands;

  /// \brief The underlying storage for auxiliary memcpy and memmove info.
  SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData;

  /// \brief A side data structure used when building up the partitions and uses.
  ///
  /// This mapping is only really used during the initial building of the
  /// partitioning so that we can retain information about PHI and select nodes
  /// processed.
  SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes;

  /// \brief Auxiliary information for particular PHI or select operands.
  SmallDenseMap<std::pair<Instruction *, Value *>,
                std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap;

  /// \brief A utility routine called from the constructor.
  ///
  /// This does what it says on the tin. It is the key to the alloca partition
  /// splitting and merging. After it is called we have the desired disjoint
  /// collection of partitions.
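  ///
  /// Illustrative example: a splittable partition [0, 16) (e.g. from a memset)
  /// overlapping an unsplittable partition [4, 8) (e.g. from a load) becomes
  /// the disjoint sequence [0, 4), [4, 8), [8, 16).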
  void splitAndMergePartitions();
};
}

template <typename DerivedT, typename RetT>
class AllocaPartitioning::BuilderBase
    : public InstVisitor<DerivedT, RetT> {
public:
  BuilderBase(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P)
      : TD(TD),
        AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())),
        P(P) {
    enqueueUsers(AI, 0);
  }

protected:
  const TargetData &TD;
  const uint64_t AllocSize;
  AllocaPartitioning &P;

  struct OffsetUse {
    Use *U;
    int64_t Offset;
  };
  SmallVector<OffsetUse, 8> Queue;

  // The active offset and use while visiting.
  Use *U;
  int64_t Offset;

  void enqueueUsers(Instruction &I, int64_t UserOffset) {
    SmallPtrSet<User *, 8> UserSet;
    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
         UI != UE; ++UI) {
      if (!UserSet.insert(*UI))
        continue;

      OffsetUse OU = { &UI.getUse(), UserOffset };
      Queue.push_back(OU);
    }
  }

  bool computeConstantGEPOffset(GetElementPtrInst &GEPI, int64_t &GEPOffset) {
    GEPOffset = Offset;
    for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI);
         GTI != GTE; ++GTI) {
      ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
      if (!OpC)
        return false;
      if (OpC->isZero())
        continue;

      // Handle a struct index, which adds its field offset to the pointer.
      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
        unsigned ElementIdx = OpC->getZExtValue();
        const StructLayout *SL = TD.getStructLayout(STy);
        uint64_t ElementOffset = SL->getElementOffset(ElementIdx);
        // Check that we can continue to model this GEP in a signed 64-bit offset.
        if (ElementOffset > INT64_MAX ||
            (GEPOffset >= 0 &&
             ((uint64_t)GEPOffset + ElementOffset) > INT64_MAX)) {
          DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding "
                       << "what can be represented in an int64_t!\n"
                       << "  alloca: " << P.AI << "\n");
          return false;
        }
        if (GEPOffset < 0)
          GEPOffset = ElementOffset + (uint64_t)-GEPOffset;
        else
          GEPOffset += ElementOffset;
        continue;
      }

      APInt Index = OpC->getValue().sextOrTrunc(TD.getPointerSizeInBits());
      Index *= APInt(Index.getBitWidth(),
                     TD.getTypeAllocSize(GTI.getIndexedType()));
      Index += APInt(Index.getBitWidth(), (uint64_t)GEPOffset,
                     /*isSigned*/true);
      // Check if the result can be stored in our int64_t offset.
      if (!Index.isSignedIntN(sizeof(GEPOffset) * 8)) {
        DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding "
                     << "what can be represented in an int64_t!\n"
                     << "  alloca: " << P.AI << "\n");
        return false;
      }

      GEPOffset = Index.getSExtValue();
    }
    return true;
  }

  Value *foldSelectInst(SelectInst &SI) {
    // If the condition being selected on is a constant or the same value is
    // being selected between, fold the select. Yes this does (rarely) happen
    // early on.
    if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
      return SI.getOperand(1+CI->isZero());
    if (SI.getOperand(1) == SI.getOperand(2)) {
      assert(*U == SI.getOperand(1));
      return SI.getOperand(1);
    }
    return 0;
  }
};

/// \brief Builder for the alloca partitioning.
///
/// This class builds an alloca partitioning by recursively visiting the uses
/// of an alloca and splitting the partitions for each load and store at each
/// offset.
class AllocaPartitioning::PartitionBuilder
    : public BuilderBase<PartitionBuilder, bool> {
  friend class InstVisitor<PartitionBuilder, bool>;

  SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap;

public:
  PartitionBuilder(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P)
      : BuilderBase<PartitionBuilder, bool>(TD, AI, P) {}

  /// \brief Run the builder over the allocation.
  bool operator()() {
    // Note that we have to re-evaluate size on each trip through the loop as
    // the queue grows at the tail.
    for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
      U = Queue[Idx].U;
      Offset = Queue[Idx].Offset;
      if (!visit(cast<Instruction>(U->getUser())))
        return false;
    }
    return true;
  }

private:
  bool markAsEscaping(Instruction &I) {
    P.PointerEscapingInstr = &I;
    return false;
  }

  void insertUse(Instruction &I, int64_t Offset, uint64_t Size,
                 bool IsSplittable = false) {
    // Completely skip uses which don't overlap the allocation.
    if ((Offset >= 0 && (uint64_t)Offset >= AllocSize) ||
        (Offset < 0 && (uint64_t)-Offset >= Size)) {
      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
                   << " which starts past the end of the " << AllocSize
                   << " byte alloca:\n"
                   << "    alloca: " << P.AI << "\n"
                   << "       use: " << I << "\n");
      return;
    }

    // Clamp the start to the beginning of the allocation.
    if (Offset < 0) {
      DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
                   << " to start at the beginning of the alloca:\n"
                   << "    alloca: " << P.AI << "\n"
                   << "       use: " << I << "\n");
      Size -= (uint64_t)-Offset;
      Offset = 0;
    }

    uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;

    // Clamp the end offset to the end of the allocation. Note that this is
    // formulated to handle even the case where "BeginOffset + Size" overflows.
    assert(AllocSize >= BeginOffset); // Established above.
    if (Size > AllocSize - BeginOffset) {
      DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
                   << " to remain within the " << AllocSize << " byte alloca:\n"
                   << "    alloca: " << P.AI << "\n"
                   << "       use: " << I << "\n");
      EndOffset = AllocSize;
    }

    // See if we can just add a user onto the last slot currently occupied.
    if (!P.Partitions.empty() &&
        P.Partitions.back().BeginOffset == BeginOffset &&
        P.Partitions.back().EndOffset == EndOffset) {
      P.Partitions.back().IsSplittable &= IsSplittable;
      return;
    }

    Partition New(BeginOffset, EndOffset, IsSplittable);
    P.Partitions.push_back(New);
  }

  bool handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) {
    uint64_t Size = TD.getTypeStoreSize(Ty);

    // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
    // ignore it. Note that this is more strict than the generic clamping
    // behavior of insertUse. We also try to handle cases which might run the
    // risk of overflow.
    // FIXME: We should instead consider the pointer to have escaped if this
    // function is being instrumented for addressing bugs or race conditions.
    if (Offset < 0 || (uint64_t)Offset >= AllocSize ||
        Size > (AllocSize - (uint64_t)Offset)) {
      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte "
"load" : "store") << " @" << Offset 585 << " which extends past the end of the " << AllocSize 586 << " byte alloca:\n" 587 << " alloca: " << P.AI << "\n" 588 << " use: " << I << "\n"); 589 return true; 590 } 591 592 insertUse(I, Offset, Size); 593 return true; 594 } 595 596 bool visitBitCastInst(BitCastInst &BC) { 597 enqueueUsers(BC, Offset); 598 return true; 599 } 600 601 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { 602 int64_t GEPOffset; 603 if (!computeConstantGEPOffset(GEPI, GEPOffset)) 604 return markAsEscaping(GEPI); 605 606 enqueueUsers(GEPI, GEPOffset); 607 return true; 608 } 609 610 bool visitLoadInst(LoadInst &LI) { 611 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) && 612 "All simple FCA loads should have been pre-split"); 613 return handleLoadOrStore(LI.getType(), LI, Offset); 614 } 615 616 bool visitStoreInst(StoreInst &SI) { 617 Value *ValOp = SI.getValueOperand(); 618 if (ValOp == *U) 619 return markAsEscaping(SI); 620 621 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && 622 "All simple FCA stores should have been pre-split"); 623 return handleLoadOrStore(ValOp->getType(), SI, Offset); 624 } 625 626 627 bool visitMemSetInst(MemSetInst &II) { 628 assert(II.getRawDest() == *U && "Pointer use is not the destination?"); 629 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 630 uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset; 631 insertUse(II, Offset, Size, Length); 632 return true; 633 } 634 635 bool visitMemTransferInst(MemTransferInst &II) { 636 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 637 uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset; 638 if (!Size) 639 // Zero-length mem transfer intrinsics can be ignored entirely. 640 return true; 641 642 MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; 643 644 // Only intrinsics with a constant length can be split. 645 Offsets.IsSplittable = Length; 646 647 if (*U != II.getRawDest()) { 648 assert(*U == II.getRawSource()); 649 Offsets.SourceBegin = Offset; 650 Offsets.SourceEnd = Offset + Size; 651 } else { 652 Offsets.DestBegin = Offset; 653 Offsets.DestEnd = Offset + Size; 654 } 655 656 insertUse(II, Offset, Size, Offsets.IsSplittable); 657 unsigned NewIdx = P.Partitions.size() - 1; 658 659 SmallDenseMap<Instruction *, unsigned>::const_iterator PMI; 660 bool Inserted = false; 661 llvm::tie(PMI, Inserted) 662 = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)); 663 if (!Inserted && Offsets.IsSplittable) { 664 // We've found a memory transfer intrinsic which refers to the alloca as 665 // both a source and dest. We refuse to split these to simplify splitting 666 // logic. If possible, SROA will still split them into separate allocas 667 // and then re-analyze. 668 Offsets.IsSplittable = false; 669 P.Partitions[PMI->second].IsSplittable = false; 670 P.Partitions[NewIdx].IsSplittable = false; 671 } 672 673 return true; 674 } 675 676 // Disable SRoA for any intrinsics except for lifetime invariants. 677 // FIXME: What about debug instrinsics? This matches old behavior, but 678 // doesn't make sense. 
  bool visitIntrinsicInst(IntrinsicInst &II) {
    if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
        II.getIntrinsicID() == Intrinsic::lifetime_end) {
      ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
      uint64_t Size = std::min(AllocSize - Offset, Length->getLimitedValue());
      insertUse(II, Offset, Size, true);
      return true;
    }

    return markAsEscaping(II);
  }

  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
    // We consider any PHI or select that results in a direct load or store of
    // the same offset to be a viable use for partitioning purposes. These uses
    // are considered unsplittable and the size is the maximum loaded or stored
    // size.
    SmallPtrSet<Instruction *, 4> Visited;
    SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
    Visited.insert(Root);
    Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
    do {
      Instruction *I, *UsedI;
      llvm::tie(UsedI, I) = Uses.pop_back_val();

      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
        Size = std::max(Size, TD.getTypeStoreSize(LI->getType()));
        continue;
      }
      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
        Value *Op = SI->getOperand(0);
        if (Op == UsedI)
          return SI;
        Size = std::max(Size, TD.getTypeStoreSize(Op->getType()));
        continue;
      }

      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
        if (!GEP->hasAllZeroIndices())
          return GEP;
      } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
                 !isa<SelectInst>(I)) {
        return I;
      }

      for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
           ++UI)
        if (Visited.insert(cast<Instruction>(*UI)))
          Uses.push_back(std::make_pair(I, cast<Instruction>(*UI)));
    } while (!Uses.empty());

    return 0;
  }

  bool visitPHINode(PHINode &PN) {
    // See if we already have computed info on this node.
    std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN];
    if (PHIInfo.first) {
      PHIInfo.second = true;
      insertUse(PN, Offset, PHIInfo.first);
      return true;
    }

    // Check for an unsafe use of the PHI node.
    if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first))
      return markAsEscaping(*EscapingI);

    insertUse(PN, Offset, PHIInfo.first);
    return true;
  }

  bool visitSelectInst(SelectInst &SI) {
    if (Value *Result = foldSelectInst(SI)) {
      if (Result == *U)
        // If the result of the constant fold will be the pointer, recurse
        // through the select as if we had RAUW'ed it.
        enqueueUsers(SI, Offset);

      return true;
    }

    // See if we already have computed info on this node.
    std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI];
    if (SelectInfo.first) {
      SelectInfo.second = true;
      insertUse(SI, Offset, SelectInfo.first);
      return true;
    }

    // Check for an unsafe use of the select.
    if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first))
      return markAsEscaping(*EscapingI);

    insertUse(SI, Offset, SelectInfo.first);
    return true;
  }

  /// \brief Disable SROA entirely if there are unhandled users of the alloca.
  bool visitInstruction(Instruction &I) { return markAsEscaping(I); }
};


/// \brief Use adder for the alloca partitioning.
///
/// This class adds the uses of an alloca to all of the partitions which they
/// use.
/// For splittable partitions, this can end up doing essentially a linear
/// walk of the partitions, but the number of steps remains bounded by the
/// total result instruction size:
/// - The number of partitions is a result of the number of unsplittable
///   instructions using the alloca.
/// - The number of users of each partition is at worst the total number of
///   splittable instructions using the alloca.
/// Thus we will produce N * M instructions in the end, where N is the number
/// of unsplittable uses and M is the number of splittable ones. This visitor
/// does the exact same number of updates to the partitioning.
///
/// In the more common case, this visitor will leverage the fact that the
/// partition space is pre-sorted, and do a logarithmic search for the
/// partition needed, making the total visit a classical ((N + M) * log(N))
/// complexity operation.
class AllocaPartitioning::UseBuilder : public BuilderBase<UseBuilder> {
  friend class InstVisitor<UseBuilder>;

  /// \brief Set to de-duplicate dead instructions found in the use walk.
  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;

public:
  UseBuilder(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P)
      : BuilderBase<UseBuilder>(TD, AI, P) {}

  /// \brief Run the builder over the allocation.
  void operator()() {
    // Note that we have to re-evaluate size on each trip through the loop as
    // the queue grows at the tail.
    for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
      U = Queue[Idx].U;
      Offset = Queue[Idx].Offset;
      this->visit(cast<Instruction>(U->getUser()));
    }
  }

private:
  void markAsDead(Instruction &I) {
    if (VisitedDeadInsts.insert(&I))
      P.DeadUsers.push_back(&I);
  }

  void insertUse(Instruction &User, int64_t Offset, uint64_t Size) {
    // If the use extends outside of the allocation, record it as a dead use
    // for elimination later.
    if ((uint64_t)Offset >= AllocSize ||
        (Offset < 0 && (uint64_t)-Offset >= Size))
      return markAsDead(User);

    // Clamp the start to the beginning of the allocation.
    if (Offset < 0) {
      Size -= (uint64_t)-Offset;
      Offset = 0;
    }

    uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;

    // Clamp the end offset to the end of the allocation. Note that this is
    // formulated to handle even the case where "BeginOffset + Size" overflows.
    assert(AllocSize >= BeginOffset); // Established above.
    if (Size > AllocSize - BeginOffset)
      EndOffset = AllocSize;

    // NB: This only works if we have zero overlapping partitions.
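    // The partitions are sorted by begin offset, so lower_bound finds the
    // first partition whose begin offset is not below BeginOffset; if the
    // preceding partition still overlaps BeginOffset, step back to it.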
    iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset);
    if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset)
      B = llvm::prior(B);
    for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset;
         ++I) {
      PartitionUse NewUse(std::max(I->BeginOffset, BeginOffset),
                          std::min(I->EndOffset, EndOffset),
                          &User, cast<Instruction>(*U));
      P.use_push_back(I, NewUse);
      if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser()))
        P.PHIOrSelectOpMap[std::make_pair(&User, U->get())]
          = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1);
    }
  }

  void handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) {
    uint64_t Size = TD.getTypeStoreSize(Ty);

    // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
    // ignore it. Note that this is more strict than the generic clamping
    // behavior of insertUse.
    if (Offset < 0 || (uint64_t)Offset >= AllocSize ||
        Size > (AllocSize - (uint64_t)Offset))
      return markAsDead(I);

    insertUse(I, Offset, Size);
  }

  void visitBitCastInst(BitCastInst &BC) {
    if (BC.use_empty())
      return markAsDead(BC);

    enqueueUsers(BC, Offset);
  }

  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    if (GEPI.use_empty())
      return markAsDead(GEPI);

    int64_t GEPOffset;
    if (!computeConstantGEPOffset(GEPI, GEPOffset))
      llvm_unreachable("Unable to compute constant offset for use");

    enqueueUsers(GEPI, GEPOffset);
  }

  void visitLoadInst(LoadInst &LI) {
    handleLoadOrStore(LI.getType(), LI, Offset);
  }

  void visitStoreInst(StoreInst &SI) {
    handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset);
  }

  void visitMemSetInst(MemSetInst &II) {
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
    insertUse(II, Offset, Size);
  }

  void visitMemTransferInst(MemTransferInst &II) {
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
    insertUse(II, Offset, Size);
  }

  void visitIntrinsicInst(IntrinsicInst &II) {
    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
           II.getIntrinsicID() == Intrinsic::lifetime_end);

    ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
    insertUse(II, Offset,
              std::min(AllocSize - Offset, Length->getLimitedValue()));
  }

  void insertPHIOrSelect(Instruction &User, uint64_t Offset) {
    uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first;

    // For PHI and select operands outside the alloca, we can't nuke the entire
    // phi or select -- the other side might still be relevant, so we special
    // case them here and use a separate structure to track the operands
    // themselves which should be replaced with undef.
    if (Offset >= AllocSize) {
      P.DeadOperands.push_back(U);
      return;
    }

    insertUse(User, Offset, Size);
  }
  void visitPHINode(PHINode &PN) {
    if (PN.use_empty())
      return markAsDead(PN);

    insertPHIOrSelect(PN, Offset);
  }
  void visitSelectInst(SelectInst &SI) {
    if (SI.use_empty())
      return markAsDead(SI);

    if (Value *Result = foldSelectInst(SI)) {
      if (Result == *U)
        // If the result of the constant fold will be the pointer, recurse
        // through the select as if we had RAUW'ed it.
        enqueueUsers(SI, Offset);
      else
        // Otherwise the operand to the select is dead, and we can replace it
        // with undef.
        P.DeadOperands.push_back(U);

      return;
    }

    insertPHIOrSelect(SI, Offset);
  }

  /// \brief Unreachable, we've already visited the alloca once.
  void visitInstruction(Instruction &I) {
    llvm_unreachable("Unhandled instruction in use builder.");
  }
};

void AllocaPartitioning::splitAndMergePartitions() {
  size_t NumDeadPartitions = 0;

  // Track the range of splittable partitions that we pass when accumulating
  // overlapping unsplittable partitions.
  uint64_t SplitEndOffset = 0ull;

  Partition New(0ull, 0ull, false);

  for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) {
    ++j;

    if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) {
      assert(New.BeginOffset == New.EndOffset);
      New = Partitions[i];
    } else {
      assert(New.IsSplittable);
      New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset);
    }
    assert(New.BeginOffset != New.EndOffset);

    // Scan the overlapping partitions.
    while (j != e && New.EndOffset > Partitions[j].BeginOffset) {
      // If the new partition we are forming is splittable, stop at the first
      // unsplittable partition.
      if (New.IsSplittable && !Partitions[j].IsSplittable)
        break;

      // Grow the new partition to include any equally splittable range. 'j' is
      // always equally splittable when New is splittable, but when New is not
      // splittable, we may subsume some (or part of some) splittable partition
      // without growing the new one.
      if (New.IsSplittable == Partitions[j].IsSplittable) {
        New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset);
      } else {
        assert(!New.IsSplittable);
        assert(Partitions[j].IsSplittable);
        SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset);
      }

      Partitions[j].BeginOffset = Partitions[j].EndOffset = UINT64_MAX;
      ++NumDeadPartitions;
      ++j;
    }

    // If the new partition is splittable, chop off the end as soon as the
    // unsplittable subsequent partition starts and ensure we eventually cover
    // the splittable area.
    if (j != e && New.IsSplittable) {
      SplitEndOffset = std::max(SplitEndOffset, New.EndOffset);
      New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
    }

    // Add the new partition if it differs from the original one and is
    // non-empty. We can end up with an empty partition here if it was
    // splittable but there is an unsplittable one that starts at the same
    // offset.
    if (New != Partitions[i]) {
      if (New.BeginOffset != New.EndOffset)
        Partitions.push_back(New);
      // Mark the old one for removal.
      Partitions[i].BeginOffset = Partitions[i].EndOffset = UINT64_MAX;
      ++NumDeadPartitions;
    }

    New.BeginOffset = New.EndOffset;
    if (!New.IsSplittable) {
      New.EndOffset = std::max(New.EndOffset, SplitEndOffset);
      if (j != e && !Partitions[j].IsSplittable)
        New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
      New.IsSplittable = true;
      // If there is a trailing splittable partition which won't be fused into
      // the next splittable partition go ahead and add it onto the partitions
      // list.
      if (New.BeginOffset < New.EndOffset &&
          (j == e || !Partitions[j].IsSplittable ||
           New.EndOffset < Partitions[j].BeginOffset)) {
        Partitions.push_back(New);
        New.BeginOffset = New.EndOffset = 0ull;
      }
    }
  }

  // Re-sort the partitions now that they have been split and merged into
  // a disjoint set of partitions. Also remove any of the dead partitions we've
  // replaced in the process.
  std::sort(Partitions.begin(), Partitions.end());
  if (NumDeadPartitions) {
    assert(Partitions.back().BeginOffset == UINT64_MAX);
    assert(Partitions.back().EndOffset == UINT64_MAX);
    assert((ptrdiff_t)NumDeadPartitions ==
           std::count(Partitions.begin(), Partitions.end(), Partitions.back()));
  }
  Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end());
}

AllocaPartitioning::AllocaPartitioning(const TargetData &TD, AllocaInst &AI)
    :
#ifndef NDEBUG
      AI(AI),
#endif
      PointerEscapingInstr(0) {
  PartitionBuilder PB(TD, AI, *this);
  if (!PB())
    return;

  if (Partitions.size() > 1) {
    // Sort the partitions. This arranges for the offsets to be in ascending
    // order, and the sizes to be in descending order.
    std::sort(Partitions.begin(), Partitions.end());

    // Intersect splittability for all partitions with equal offsets and sizes.
    // Then remove all but the first so that we have a sequence of non-equal but
    // potentially overlapping partitions.
    for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E;
         I = J) {
      ++J;
      while (J != E && *I == *J) {
        I->IsSplittable &= J->IsSplittable;
        ++J;
      }
    }
    Partitions.erase(std::unique(Partitions.begin(), Partitions.end()),
                     Partitions.end());

    // Split splittable and merge unsplittable partitions into a disjoint set
    // of partitions over the used space of the allocation.
    splitAndMergePartitions();
  }

  // Now build up the user lists for each of these disjoint partitions by
  // re-walking the recursive users of the alloca.
  Uses.resize(Partitions.size());
  UseBuilder UB(TD, AI, *this);
  UB();
}

Type *AllocaPartitioning::getCommonType(iterator I) const {
  Type *Ty = 0;
  for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
    if (isa<IntrinsicInst>(*UI->User))
      continue;
    if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset)
      continue;

    Type *UserTy = 0;
    if (LoadInst *LI = dyn_cast<LoadInst>(&*UI->User)) {
      UserTy = LI->getType();
    } else if (StoreInst *SI = dyn_cast<StoreInst>(&*UI->User)) {
      UserTy = SI->getValueOperand()->getType();
    } else if (SelectInst *SI = dyn_cast<SelectInst>(&*UI->User)) {
      if (PointerType *PtrTy = dyn_cast<PointerType>(SI->getType()))
        UserTy = PtrTy->getElementType();
    } else if (PHINode *PN = dyn_cast<PHINode>(&*UI->User)) {
      if (PointerType *PtrTy = dyn_cast<PointerType>(PN->getType()))
        UserTy = PtrTy->getElementType();
    }

    if (Ty && Ty != UserTy)
      return 0;

    Ty = UserTy;
  }
  return Ty;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

void AllocaPartitioning::print(raw_ostream &OS, const_iterator I,
                               StringRef Indent) const {
  OS << Indent << "partition #" << (I - begin())
     << " [" << I->BeginOffset << "," << I->EndOffset << ")"
     << (I->IsSplittable ? " (splittable)" : "")
     << (Uses[I - begin()].empty() ? " (zero uses)" : "")
     << "\n";
}

void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I,
                                    StringRef Indent) const {
  for (const_use_iterator UI = use_begin(I), UE = use_end(I);
       UI != UE; ++UI) {
    OS << Indent << "  [" << UI->BeginOffset << "," << UI->EndOffset << ") "
       << "used by: " << *UI->User << "\n";
    if (MemTransferInst *II = dyn_cast<MemTransferInst>(&*UI->User)) {
      const MemTransferOffsets &MTO = MemTransferInstData.lookup(II);
      bool IsDest;
      if (!MTO.IsSplittable)
        IsDest = UI->BeginOffset == MTO.DestBegin;
      else
        IsDest = MTO.DestBegin != 0u;
      OS << Indent << "  (original " << (IsDest ? "dest" : "source") << ": "
         << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin)
         << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n";
    }
  }
}

void AllocaPartitioning::print(raw_ostream &OS) const {
  if (PointerEscapingInstr) {
    OS << "No partitioning for alloca: " << AI << "\n"
       << "  A pointer to this alloca escaped by:\n"
       << "  " << *PointerEscapingInstr << "\n";
    return;
  }

  OS << "Partitioning of alloca: " << AI << "\n";
  unsigned Num = 0;
  for (const_iterator I = begin(), E = end(); I != E; ++I, ++Num) {
    print(OS, I);
    printUsers(OS, I);
  }
}

void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); }
void AllocaPartitioning::dump() const { print(dbgs()); }

#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)


namespace {
/// \brief Implementation of LoadAndStorePromoter for promoting allocas.
///
/// This subclass of LoadAndStorePromoter adds overrides to handle promoting
/// the loads and stores of an alloca instruction, as well as updating its
/// debug information. This is used when a domtree is unavailable and thus
/// mem2reg in its full form can't be used to handle promotion of allocas to
/// scalar values.
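/// (For example, this path is exercised when the hidden -force-ssa-updater
/// flag disables the DomTree and mem2reg based promotion.)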
class AllocaPromoter : public LoadAndStorePromoter {
  AllocaInst &AI;
  DIBuilder &DIB;

  SmallVector<DbgDeclareInst *, 4> DDIs;
  SmallVector<DbgValueInst *, 4> DVIs;

public:
  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
                 AllocaInst &AI, DIBuilder &DIB)
      : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}

  void run(const SmallVectorImpl<Instruction*> &Insts) {
    // Remember which alloca we're promoting (for isInstInList).
    if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) {
      for (Value::use_iterator UI = DebugNode->use_begin(),
                               UE = DebugNode->use_end();
           UI != UE; ++UI)
        if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI))
          DDIs.push_back(DDI);
        else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI))
          DVIs.push_back(DVI);
    }

    LoadAndStorePromoter::run(Insts);
    AI.eraseFromParent();
    while (!DDIs.empty())
      DDIs.pop_back_val()->eraseFromParent();
    while (!DVIs.empty())
      DVIs.pop_back_val()->eraseFromParent();
  }

  virtual bool isInstInList(Instruction *I,
                            const SmallVectorImpl<Instruction*> &Insts) const {
    if (LoadInst *LI = dyn_cast<LoadInst>(I))
      return LI->getOperand(0) == &AI;
    return cast<StoreInst>(I)->getPointerOperand() == &AI;
  }

  virtual void updateDebugInfo(Instruction *Inst) const {
    for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(),
           E = DDIs.end(); I != E; ++I) {
      DbgDeclareInst *DDI = *I;
      if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
        ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
      else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
        ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
    }
    for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(),
           E = DVIs.end(); I != E; ++I) {
      DbgValueInst *DVI = *I;
      Value *Arg = NULL;
      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
        // If an argument is zero extended then use argument directly. The ZExt
        // may be zapped by an optimization pass in future.
        if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
          Arg = dyn_cast<Argument>(ZExt->getOperand(0));
        if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
          Arg = dyn_cast<Argument>(SExt->getOperand(0));
        if (!Arg)
          Arg = SI->getOperand(0);
      } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
        Arg = LI->getOperand(0);
      } else {
        continue;
      }
      Instruction *DbgVal =
        DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
                                    Inst);
      DbgVal->setDebugLoc(DVI->getDebugLoc());
    }
  }
};
} // end anon namespace


namespace {
/// \brief An optimization pass providing Scalar Replacement of Aggregates.
///
/// This pass takes allocations which can be completely analyzed (that is, they
/// don't escape) and tries to turn them into scalar SSA values. There are
/// a few steps to this process.
///
/// 1) It takes allocations of aggregates and analyzes the ways in which they
///    are used to try to split them into smaller allocations, ideally of
///    a single scalar data type. It will split up memcpy and memset accesses
///    as necessary and try to isolate individual scalar accesses.
/// 2) It will transform accesses into forms which are suitable for SSA value
///    promotion.
///    This can be replacing a memset with a scalar store of an
///    integer value, or it can involve speculating operations on a PHI or
///    select to be a PHI or select of the results.
/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
///    onto insert and extract operations on a vector value, and convert them to
///    this form. By doing so, it will enable promotion of vector aggregates to
///    SSA vector values.
class SROA : public FunctionPass {
  const bool RequiresDomTree;

  LLVMContext *C;
  const TargetData *TD;
  DominatorTree *DT;

  /// \brief Worklist of alloca instructions to simplify.
  ///
  /// Each alloca in the function is added to this. Each new alloca formed gets
  /// added to it as well to recursively simplify unless that alloca can be
  /// directly promoted. Finally, each time we rewrite a use of an alloca other
  /// than the one being actively rewritten, we add it back onto the list if not
  /// already present to ensure it is re-visited.
  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist;

  /// \brief A collection of instructions to delete.
  /// We try to batch deletions to simplify code and make things a bit more
  /// efficient.
  SmallVector<Instruction *, 8> DeadInsts;

  /// \brief A set to prevent repeatedly marking an instruction split into many
  /// uses as dead. Only used to guard insertion into DeadInsts.
  SmallPtrSet<Instruction *, 4> DeadSplitInsts;

  /// \brief A collection of alloca instructions we can directly promote.
  std::vector<AllocaInst *> PromotableAllocas;

public:
  SROA(bool RequiresDomTree = true)
      : FunctionPass(ID), RequiresDomTree(RequiresDomTree),
        C(0), TD(0), DT(0) {
    initializeSROAPass(*PassRegistry::getPassRegistry());
  }
  bool runOnFunction(Function &F);
  void getAnalysisUsage(AnalysisUsage &AU) const;

  const char *getPassName() const { return "SROA"; }
  static char ID;

private:
  friend class AllocaPartitionRewriter;
  friend class AllocaPartitionVectorRewriter;

  bool rewriteAllocaPartition(AllocaInst &AI,
                              AllocaPartitioning &P,
                              AllocaPartitioning::iterator PI);
  bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P);
  bool runOnAlloca(AllocaInst &AI);
  void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas);
  bool promoteAllocas(Function &F);
};
}

char SROA::ID = 0;

FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
  return new SROA(RequiresDomTree);
}

INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
                    false, false)

/// \brief Accumulate the constant offsets in a GEP into a single APInt offset.
///
/// If the provided GEP is all-constant, the total byte offset formed by the
/// GEP is computed and Offset is set to it. If the GEP has any non-constant
/// operands, the function returns false and the value of Offset is unmodified.
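///
/// Illustrative example (assuming a typical target layout): for
///   getelementptr {i32, [4 x i8]}* %p, i32 0, i32 1, i32 2
/// the accumulated offset would be 4 (field 1) + 2 (array index) = 6 bytes.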
static bool accumulateGEPOffsets(const TargetData &TD, GEPOperator &GEP,
                                 APInt &Offset) {
  APInt GEPOffset(Offset.getBitWidth(), 0);
  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      return false;
    if (OpC->isZero()) continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = TD.getStructLayout(STy);
      GEPOffset += APInt(Offset.getBitWidth(),
                         SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(Offset.getBitWidth(),
                   TD.getTypeAllocSize(GTI.getIndexedType()));
    if (VectorType *VTy = dyn_cast<VectorType>(*GTI)) {
      assert((VTy->getScalarSizeInBits() % 8) == 0 &&
             "vector element size is not a multiple of 8, cannot GEP over it");
      TypeSize = VTy->getScalarSizeInBits() / 8;
    }

    GEPOffset += OpC->getValue().sextOrTrunc(Offset.getBitWidth()) * TypeSize;
  }
  Offset = GEPOffset;
  return true;
}

/// \brief Build a GEP out of a base pointer and indices.
///
/// This will return the BasePtr if that is valid, or build a new GEP
/// instruction using the IRBuilder if GEP-ing is needed.
static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr,
                       SmallVectorImpl<Value *> &Indices,
                       const Twine &Prefix) {
  if (Indices.empty())
    return BasePtr;

  // A single zero index is a no-op, so check for this and avoid building a GEP
  // in that case.
  if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
    return BasePtr;

  return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx");
}

/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
/// TargetTy without changing the offset of the pointer.
///
/// This routine assumes we've already established a properly offset GEP with
/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
/// zero-indices down through type layers until we find one the same as
/// TargetTy. If we can't find one with the same type, we at least try to use
/// one with the same size. If none of that works, we just produce the GEP as
/// indicated by Indices to have the correct offset.
static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const TargetData &TD,
                                    Value *BasePtr, Type *Ty, Type *TargetTy,
                                    SmallVectorImpl<Value *> &Indices,
                                    const Twine &Prefix) {
  if (Ty == TargetTy)
    return buildGEP(IRB, BasePtr, Indices, Prefix);

  // See if we can descend into a struct and locate a field with the correct
  // type.
  unsigned NumLayers = 0;
  Type *ElementTy = Ty;
  do {
    if (ElementTy->isPointerTy())
      break;
    if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) {
      ElementTy = SeqTy->getElementType();
      Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(), 0)));
    } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
      ElementTy = *STy->element_begin();
      Indices.push_back(IRB.getInt32(0));
    } else {
      break;
    }
    ++NumLayers;
  } while (ElementTy != TargetTy);
  if (ElementTy != TargetTy)
    Indices.erase(Indices.end() - NumLayers, Indices.end());

  return buildGEP(IRB, BasePtr, Indices, Prefix);
}

/// \brief Recursively compute indices for a natural GEP.
///
/// This is the recursive step for getNaturalGEPWithOffset that walks down the
/// element types adding appropriate indices for the GEP.
static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const TargetData &TD,
                                       Value *Ptr, Type *Ty, APInt &Offset,
                                       Type *TargetTy,
                                       SmallVectorImpl<Value *> &Indices,
                                       const Twine &Prefix) {
  if (Offset == 0)
    return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix);

  // We can't recurse through pointer types.
  if (Ty->isPointerTy())
    return 0;

  // We try to analyze GEPs over vectors here, but note that these GEPs are
  // extremely poorly defined currently. The long-term goal is to remove GEPing
  // over a vector from the IR completely.
  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
    unsigned ElementSizeInBits = VecTy->getScalarSizeInBits();
    if (ElementSizeInBits % 8)
      return 0; // GEPs over non-multiple of 8 size vector elements are invalid.
    APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
    APInt NumSkippedElements = Offset.udiv(ElementSize);
    if (NumSkippedElements.ugt(VecTy->getNumElements()))
      return 0;
    Offset -= NumSkippedElements * ElementSize;
    Indices.push_back(IRB.getInt(NumSkippedElements));
    return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(),
                                    Offset, TargetTy, Indices, Prefix);
  }

  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
    Type *ElementTy = ArrTy->getElementType();
    APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
    APInt NumSkippedElements = Offset.udiv(ElementSize);
    if (NumSkippedElements.ugt(ArrTy->getNumElements()))
      return 0;

    Offset -= NumSkippedElements * ElementSize;
    Indices.push_back(IRB.getInt(NumSkippedElements));
    return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
                                    Indices, Prefix);
  }

  StructType *STy = dyn_cast<StructType>(Ty);
  if (!STy)
    return 0;

  const StructLayout *SL = TD.getStructLayout(STy);
  uint64_t StructOffset = Offset.getZExtValue();
  if (StructOffset >= SL->getSizeInBytes())
    return 0;
  unsigned Index = SL->getElementContainingOffset(StructOffset);
  Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
  Type *ElementTy = STy->getElementType(Index);
  if (Offset.uge(TD.getTypeAllocSize(ElementTy)))
    return 0; // The offset points into alignment padding.
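
  // Otherwise descend into the struct element containing the offset and
  // recurse on the remaining intra-element offset.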
1513 1514 Indices.push_back(IRB.getInt32(Index)); 1515 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1516 Indices, Prefix); 1517} 1518 1519/// \brief Get a natural GEP from a base pointer to a particular offset and 1520/// resulting in a particular type. 1521/// 1522/// The goal is to produce a "natural" looking GEP that works with the existing 1523/// composite types to arrive at the appropriate offset and element type for 1524/// a pointer. TargetTy is the element type the returned GEP should point-to if 1525/// possible. We recurse by decreasing Offset, adding the appropriate index to 1526/// Indices, and setting Ty to the result subtype. 1527/// 1528/// If no natural GEP can be constructed, this function returns null. 1529static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const TargetData &TD, 1530 Value *Ptr, APInt Offset, Type *TargetTy, 1531 SmallVectorImpl<Value *> &Indices, 1532 const Twine &Prefix) { 1533 PointerType *Ty = cast<PointerType>(Ptr->getType()); 1534 1535 // Don't consider any GEPs through an i8* as natural unless the TargetTy is 1536 // an i8. 1537 if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8)) 1538 return 0; 1539 1540 Type *ElementTy = Ty->getElementType(); 1541 if (!ElementTy->isSized()) 1542 return 0; // We can't GEP through an unsized element. 1543 APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); 1544 if (ElementSize == 0) 1545 return 0; // Zero-length arrays can't help us build a natural GEP. 1546 APInt NumSkippedElements = Offset.udiv(ElementSize); 1547 1548 Offset -= NumSkippedElements * ElementSize; 1549 Indices.push_back(IRB.getInt(NumSkippedElements)); 1550 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1551 Indices, Prefix); 1552} 1553 1554/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the 1555/// resulting pointer has PointerTy. 1556/// 1557/// This tries very hard to compute a "natural" GEP which arrives at the offset 1558/// and produces the pointer type desired. Where it cannot, it will try to use 1559/// the natural GEP to arrive at the offset and bitcast to the type. Where that 1560/// fails, it will try to use an existing i8* and GEP to the byte offset and 1561/// bitcast to the type. 1562/// 1563/// The strategy for finding the more natural GEPs is to peel off layers of the 1564/// pointer, walking back through bit casts and GEPs, searching for a base 1565/// pointer from which we can compute a natural GEP with the desired 1566/// properities. The algorithm tries to fold as many constant indices into 1567/// a single GEP as possible, thus making each GEP more independent of the 1568/// surrounding code. 1569static Value *getAdjustedPtr(IRBuilder<> &IRB, const TargetData &TD, 1570 Value *Ptr, APInt Offset, Type *PointerTy, 1571 const Twine &Prefix) { 1572 // Even though we don't look through PHI nodes, we could be called on an 1573 // instruction in an unreachable block, which may be on a cycle. 1574 SmallPtrSet<Value *, 4> Visited; 1575 Visited.insert(Ptr); 1576 SmallVector<Value *, 4> Indices; 1577 1578 // We may end up computing an offset pointer that has the wrong type. If we 1579 // never are able to compute one directly that has the correct type, we'll 1580 // fall back to it, so keep it around here. 1581 Value *OffsetPtr = 0; 1582 1583 // Remember any i8 pointer we come across to re-use if we need to do a raw 1584 // byte offset. 
1585 Value *Int8Ptr = 0; 1586 APInt Int8PtrOffset(Offset.getBitWidth(), 0); 1587 1588 Type *TargetTy = PointerTy->getPointerElementType(); 1589 1590 do { 1591 // First fold any existing GEPs into the offset. 1592 while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { 1593 APInt GEPOffset(Offset.getBitWidth(), 0); 1594 if (!accumulateGEPOffsets(TD, *GEP, GEPOffset)) 1595 break; 1596 Offset += GEPOffset; 1597 Ptr = GEP->getPointerOperand(); 1598 if (!Visited.insert(Ptr)) 1599 break; 1600 } 1601 1602 // See if we can perform a natural GEP here. 1603 Indices.clear(); 1604 if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy, 1605 Indices, Prefix)) { 1606 if (P->getType() == PointerTy) { 1607 // Zap any offset pointer that we ended up computing in previous rounds. 1608 if (OffsetPtr && OffsetPtr->use_empty()) 1609 if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) 1610 I->eraseFromParent(); 1611 return P; 1612 } 1613 if (!OffsetPtr) { 1614 OffsetPtr = P; 1615 } 1616 } 1617 1618 // Stash this pointer if we've found an i8*. 1619 if (Ptr->getType()->isIntegerTy(8)) { 1620 Int8Ptr = Ptr; 1621 Int8PtrOffset = Offset; 1622 } 1623 1624 // Peel off a layer of the pointer and update the offset appropriately. 1625 if (Operator::getOpcode(Ptr) == Instruction::BitCast) { 1626 Ptr = cast<Operator>(Ptr)->getOperand(0); 1627 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) { 1628 if (GA->mayBeOverridden()) 1629 break; 1630 Ptr = GA->getAliasee(); 1631 } else { 1632 break; 1633 } 1634 assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!"); 1635 } while (Visited.insert(Ptr)); 1636 1637 if (!OffsetPtr) { 1638 if (!Int8Ptr) { 1639 Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(), 1640 Prefix + ".raw_cast"); 1641 Int8PtrOffset = Offset; 1642 } 1643 1644 OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr : 1645 IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), 1646 Prefix + ".raw_idx"); 1647 } 1648 Ptr = OffsetPtr; 1649 1650 // On the off chance we were targeting i8*, guard the bitcast here. 1651 if (Ptr->getType() != PointerTy) 1652 Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast"); 1653 1654 return Ptr; 1655} 1656 1657/// \brief Test whether the given alloca partition can be promoted to a vector. 1658/// 1659/// This is a quick test to check whether we can rewrite a particular alloca 1660/// partition (and its newly formed alloca) into a vector alloca with only 1661/// whole-vector loads and stores such that it could be promoted to a vector 1662/// SSA value. We only can ensure this for a limited set of operations, and we 1663/// don't want to do the rewrites unless we are confident that the result will 1664/// be promotable, so we have an early test here. 1665static bool isVectorPromotionViable(const TargetData &TD, 1666 Type *AllocaTy, 1667 AllocaPartitioning &P, 1668 uint64_t PartitionBeginOffset, 1669 uint64_t PartitionEndOffset, 1670 AllocaPartitioning::const_use_iterator I, 1671 AllocaPartitioning::const_use_iterator E) { 1672 VectorType *Ty = dyn_cast<VectorType>(AllocaTy); 1673 if (!Ty) 1674 return false; 1675 1676 uint64_t VecSize = TD.getTypeSizeInBits(Ty); 1677 uint64_t ElementSize = Ty->getScalarSizeInBits(); 1678 1679 // While the definition of LLVM vectors is bitpacked, we don't support sizes 1680 // that aren't byte sized. 
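  // As a concrete picture of the mapping checked below (illustrative, not
  // from the source): for a <4 x i32> alloca, ElementSize becomes 4 bytes and
  // VecSize 16 bytes. A use covering bytes [4,8) maps cleanly onto element 1
  // and a use covering [0,16) maps onto the whole vector, while a use
  // covering [2,6) straddles elements and defeats vector promotion.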
1681 if (ElementSize % 8) 1682 return false; 1683 assert((VecSize % 8) == 0 && "vector size not a multiple of element size?"); 1684 VecSize /= 8; 1685 ElementSize /= 8; 1686 1687 for (; I != E; ++I) { 1688 uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset; 1689 uint64_t BeginIndex = BeginOffset / ElementSize; 1690 if (BeginIndex * ElementSize != BeginOffset || 1691 BeginIndex >= Ty->getNumElements()) 1692 return false; 1693 uint64_t EndOffset = I->EndOffset - PartitionBeginOffset; 1694 uint64_t EndIndex = EndOffset / ElementSize; 1695 if (EndIndex * ElementSize != EndOffset || 1696 EndIndex > Ty->getNumElements()) 1697 return false; 1698 1699 // FIXME: We should build shuffle vector instructions to handle 1700 // non-element-sized accesses. 1701 if ((EndOffset - BeginOffset) != ElementSize && 1702 (EndOffset - BeginOffset) != VecSize) 1703 return false; 1704 1705 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&*I->User)) { 1706 if (MI->isVolatile()) 1707 return false; 1708 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(&*I->User)) { 1709 const AllocaPartitioning::MemTransferOffsets &MTO 1710 = P.getMemTransferOffsets(*MTI); 1711 if (!MTO.IsSplittable) 1712 return false; 1713 } 1714 } else if (I->Ptr->getType()->getPointerElementType()->isStructTy()) { 1715 // Disable vector promotion when there are loads or stores of an FCA. 1716 return false; 1717 } else if (!isa<LoadInst>(*I->User) && !isa<StoreInst>(*I->User)) { 1718 return false; 1719 } 1720 } 1721 return true; 1722} 1723 1724/// \brief Test whether the given alloca partition can be promoted to an int. 1725/// 1726/// This is a quick test to check whether we can rewrite a particular alloca 1727/// partition (and its newly formed alloca) into an integer alloca suitable for 1728/// promotion to an SSA value. We only can ensure this for a limited set of 1729/// operations, and we don't want to do the rewrites unless we are confident 1730/// that the result will be promotable, so we have an early test here. 1731static bool isIntegerPromotionViable(const TargetData &TD, 1732 Type *AllocaTy, 1733 AllocaPartitioning &P, 1734 AllocaPartitioning::const_use_iterator I, 1735 AllocaPartitioning::const_use_iterator E) { 1736 IntegerType *Ty = dyn_cast<IntegerType>(AllocaTy); 1737 if (!Ty) 1738 return false; 1739 1740 // Check the uses to ensure the uses are (likely) promoteable integer uses. 1741 // Also ensure that the alloca has a covering load or store. We don't want 1742 // promote because of some other unsplittable entry (which we may make 1743 // splittable later) and lose the ability to promote each element access. 
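  // For example (illustrative): an i64 alloca with one "store i64 %v" plus
  // several narrower integer loads has a covering store and passes; an i64
  // alloca touched only by a splittable memcpy would not, since nothing
  // requires the whole-integer form and splitting may serve it better.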
1744 bool WholeAllocaOp = false; 1745 for (; I != E; ++I) { 1746 if (LoadInst *LI = dyn_cast<LoadInst>(&*I->User)) { 1747 if (LI->isVolatile() || !LI->getType()->isIntegerTy()) 1748 return false; 1749 if (LI->getType() == Ty) 1750 WholeAllocaOp = true; 1751 } else if (StoreInst *SI = dyn_cast<StoreInst>(&*I->User)) { 1752 if (SI->isVolatile() || !SI->getValueOperand()->getType()->isIntegerTy()) 1753 return false; 1754 if (SI->getValueOperand()->getType() == Ty) 1755 WholeAllocaOp = true; 1756 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&*I->User)) { 1757 if (MI->isVolatile()) 1758 return false; 1759 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(&*I->User)) { 1760 const AllocaPartitioning::MemTransferOffsets &MTO 1761 = P.getMemTransferOffsets(*MTI); 1762 if (!MTO.IsSplittable) 1763 return false; 1764 } 1765 } else { 1766 return false; 1767 } 1768 } 1769 return WholeAllocaOp; 1770} 1771 1772namespace { 1773/// \brief Visitor to rewrite instructions using a partition of an alloca to 1774/// use a new alloca. 1775/// 1776/// Also implements the rewriting to vector-based accesses when the partition 1777/// passes the isVectorPromotionViable predicate. Most of the rewriting logic 1778/// lives here. 1779class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, 1780 bool> { 1781 // Befriend the base class so it can delegate to private visit methods. 1782 friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>; 1783 1784 const TargetData &TD; 1785 AllocaPartitioning &P; 1786 SROA &Pass; 1787 AllocaInst &OldAI, &NewAI; 1788 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; 1789 1790 // If we are rewriting an alloca partition which can be written as pure 1791 // vector operations, we stash extra information here. When VecTy is 1792 // non-null, we have some strict guarantees about the rewriten alloca: 1793 // - The new alloca is exactly the size of the vector type here. 1794 // - The accesses all either map to the entire vector or to a single 1795 // element. 1796 // - The set of accessing instructions is only one of those handled above 1797 // in isVectorPromotionViable. Generally these are the same access kinds 1798 // which are promotable via mem2reg. 1799 VectorType *VecTy; 1800 Type *ElementTy; 1801 uint64_t ElementSize; 1802 1803 // This is a convenience and flag variable that will be null unless the new 1804 // alloca has a promotion-targeted integer type due to passing 1805 // isIntegerPromotionViable above. If it is non-null does, the desired 1806 // integer type will be stored here for easy access during rewriting. 1807 IntegerType *IntPromotionTy; 1808 1809 // The offset of the partition user currently being rewritten. 1810 uint64_t BeginOffset, EndOffset; 1811 Instruction *OldPtr; 1812 1813 // The name prefix to use when rewriting instructions for this alloca. 1814 std::string NamePrefix; 1815 1816public: 1817 AllocaPartitionRewriter(const TargetData &TD, AllocaPartitioning &P, 1818 AllocaPartitioning::iterator PI, 1819 SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, 1820 uint64_t NewBeginOffset, uint64_t NewEndOffset) 1821 : TD(TD), P(P), Pass(Pass), 1822 OldAI(OldAI), NewAI(NewAI), 1823 NewAllocaBeginOffset(NewBeginOffset), 1824 NewAllocaEndOffset(NewEndOffset), 1825 VecTy(), ElementTy(), ElementSize(), IntPromotionTy(), 1826 BeginOffset(), EndOffset() { 1827 } 1828 1829 /// \brief Visit the users of the alloca partition and rewrite them. 
1830 bool visitUsers(AllocaPartitioning::const_use_iterator I, 1831 AllocaPartitioning::const_use_iterator E) { 1832 if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P, 1833 NewAllocaBeginOffset, NewAllocaEndOffset, 1834 I, E)) { 1835 ++NumVectorized; 1836 VecTy = cast<VectorType>(NewAI.getAllocatedType()); 1837 ElementTy = VecTy->getElementType(); 1838 assert((VecTy->getScalarSizeInBits() % 8) == 0 && 1839 "Only multiple-of-8 sized vector elements are viable"); 1840 ElementSize = VecTy->getScalarSizeInBits() / 8; 1841 } else if (isIntegerPromotionViable(TD, NewAI.getAllocatedType(), 1842 P, I, E)) { 1843 IntPromotionTy = cast<IntegerType>(NewAI.getAllocatedType()); 1844 } 1845 bool CanSROA = true; 1846 for (; I != E; ++I) { 1847 BeginOffset = I->BeginOffset; 1848 EndOffset = I->EndOffset; 1849 OldPtr = I->Ptr; 1850 NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str(); 1851 CanSROA &= visit(I->User); 1852 } 1853 if (VecTy) { 1854 assert(CanSROA); 1855 VecTy = 0; 1856 ElementTy = 0; 1857 ElementSize = 0; 1858 } 1859 return CanSROA; 1860 } 1861 1862private: 1863 // Every instruction which can end up as a user must have a rewrite rule. 1864 bool visitInstruction(Instruction &I) { 1865 DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); 1866 llvm_unreachable("No rewrite rule for this instruction!"); 1867 } 1868 1869 Twine getName(const Twine &Suffix) { 1870 return NamePrefix + Suffix; 1871 } 1872 1873 Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) { 1874 assert(BeginOffset >= NewAllocaBeginOffset); 1875 APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset); 1876 return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName("")); 1877 } 1878 1879 ConstantInt *getIndex(IRBuilder<> &IRB, uint64_t Offset) { 1880 assert(VecTy && "Can only call getIndex when rewriting a vector"); 1881 uint64_t RelOffset = Offset - NewAllocaBeginOffset; 1882 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); 1883 uint32_t Index = RelOffset / ElementSize; 1884 assert(Index * ElementSize == RelOffset); 1885 return IRB.getInt32(Index); 1886 } 1887 1888 Value *extractInteger(IRBuilder<> &IRB, IntegerType *TargetTy, 1889 uint64_t Offset) { 1890 assert(IntPromotionTy && "Alloca is not an integer we can extract from"); 1891 Value *V = IRB.CreateLoad(&NewAI, getName(".load")); 1892 assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset"); 1893 uint64_t RelOffset = Offset - NewAllocaBeginOffset; 1894 if (RelOffset) 1895 V = IRB.CreateLShr(V, RelOffset*8, getName(".shift")); 1896 if (TargetTy != IntPromotionTy) { 1897 assert(TargetTy->getBitWidth() < IntPromotionTy->getBitWidth() && 1898 "Cannot extract to a larger integer!"); 1899 V = IRB.CreateTrunc(V, TargetTy, getName(".trunc")); 1900 } 1901 return V; 1902 } 1903 1904 StoreInst *insertInteger(IRBuilder<> &IRB, Value *V, uint64_t Offset) { 1905 IntegerType *Ty = cast<IntegerType>(V->getType()); 1906 if (Ty == IntPromotionTy) 1907 return IRB.CreateStore(V, &NewAI); 1908 1909 assert(Ty->getBitWidth() < IntPromotionTy->getBitWidth() && 1910 "Cannot insert a larger integer!"); 1911 V = IRB.CreateZExt(V, IntPromotionTy, getName(".ext")); 1912 assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset"); 1913 uint64_t RelOffset = Offset - NewAllocaBeginOffset; 1914 if (RelOffset) 1915 V = IRB.CreateShl(V, RelOffset*8, getName(".shift")); 1916 1917 APInt Mask = ~Ty->getMask().zext(IntPromotionTy->getBitWidth()) 1918 .shl(RelOffset*8); 1919 Value *Old = 
IRB.CreateAnd(IRB.CreateLoad(&NewAI, getName(".oldload")), 1920 Mask, getName(".mask")); 1921 return IRB.CreateStore(IRB.CreateOr(Old, V, getName(".insert")), 1922 &NewAI); 1923 } 1924 1925 void deleteIfTriviallyDead(Value *V) { 1926 Instruction *I = cast<Instruction>(V); 1927 if (isInstructionTriviallyDead(I)) 1928 Pass.DeadInsts.push_back(I); 1929 } 1930 1931 Value *getValueCast(IRBuilder<> &IRB, Value *V, Type *Ty) { 1932 if (V->getType()->isIntegerTy() && Ty->isPointerTy()) 1933 return IRB.CreateIntToPtr(V, Ty); 1934 if (V->getType()->isPointerTy() && Ty->isIntegerTy()) 1935 return IRB.CreatePtrToInt(V, Ty); 1936 1937 return IRB.CreateBitCast(V, Ty); 1938 } 1939 1940 bool rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) { 1941 Value *Result; 1942 if (LI.getType() == VecTy->getElementType() || 1943 BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) { 1944 Result 1945 = IRB.CreateExtractElement(IRB.CreateLoad(&NewAI, getName(".load")), 1946 getIndex(IRB, BeginOffset), 1947 getName(".extract")); 1948 } else { 1949 Result = IRB.CreateLoad(&NewAI, getName(".load")); 1950 } 1951 if (Result->getType() != LI.getType()) 1952 Result = getValueCast(IRB, Result, LI.getType()); 1953 LI.replaceAllUsesWith(Result); 1954 Pass.DeadInsts.push_back(&LI); 1955 1956 DEBUG(dbgs() << " to: " << *Result << "\n"); 1957 return true; 1958 } 1959 1960 bool rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) { 1961 assert(!LI.isVolatile()); 1962 Value *Result = extractInteger(IRB, cast<IntegerType>(LI.getType()), 1963 BeginOffset); 1964 LI.replaceAllUsesWith(Result); 1965 Pass.DeadInsts.push_back(&LI); 1966 DEBUG(dbgs() << " to: " << *Result << "\n"); 1967 return true; 1968 } 1969 1970 bool visitLoadInst(LoadInst &LI) { 1971 DEBUG(dbgs() << " original: " << LI << "\n"); 1972 Value *OldOp = LI.getOperand(0); 1973 assert(OldOp == OldPtr); 1974 IRBuilder<> IRB(&LI); 1975 1976 if (VecTy) 1977 return rewriteVectorizedLoadInst(IRB, LI, OldOp); 1978 if (IntPromotionTy) 1979 return rewriteIntegerLoad(IRB, LI); 1980 1981 Value *NewPtr = getAdjustedAllocaPtr(IRB, 1982 LI.getPointerOperand()->getType()); 1983 LI.setOperand(0, NewPtr); 1984 DEBUG(dbgs() << " to: " << LI << "\n"); 1985 1986 deleteIfTriviallyDead(OldOp); 1987 return NewPtr == &NewAI && !LI.isVolatile(); 1988 } 1989 1990 bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, StoreInst &SI, 1991 Value *OldOp) { 1992 Value *V = SI.getValueOperand(); 1993 if (V->getType() == ElementTy || 1994 BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) { 1995 if (V->getType() != ElementTy) 1996 V = getValueCast(IRB, V, ElementTy); 1997 V = IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")), V, 1998 getIndex(IRB, BeginOffset), 1999 getName(".insert")); 2000 } else if (V->getType() != VecTy) { 2001 V = getValueCast(IRB, V, VecTy); 2002 } 2003 StoreInst *Store = IRB.CreateStore(V, &NewAI); 2004 Pass.DeadInsts.push_back(&SI); 2005 2006 (void)Store; 2007 DEBUG(dbgs() << " to: " << *Store << "\n"); 2008 return true; 2009 } 2010 2011 bool rewriteIntegerStore(IRBuilder<> &IRB, StoreInst &SI) { 2012 assert(!SI.isVolatile()); 2013 StoreInst *Store = insertInteger(IRB, SI.getValueOperand(), BeginOffset); 2014 Pass.DeadInsts.push_back(&SI); 2015 (void)Store; 2016 DEBUG(dbgs() << " to: " << *Store << "\n"); 2017 return true; 2018 } 2019 2020 bool visitStoreInst(StoreInst &SI) { 2021 DEBUG(dbgs() << " original: " << SI << "\n"); 2022 Value *OldOp = SI.getOperand(1); 2023 assert(OldOp == OldPtr); 2024 IRBuilder<> 
IRB(&SI); 2025 2026 if (VecTy) 2027 return rewriteVectorizedStoreInst(IRB, SI, OldOp); 2028 if (IntPromotionTy) 2029 return rewriteIntegerStore(IRB, SI); 2030 2031 Value *NewPtr = getAdjustedAllocaPtr(IRB, 2032 SI.getPointerOperand()->getType()); 2033 SI.setOperand(1, NewPtr); 2034 DEBUG(dbgs() << " to: " << SI << "\n"); 2035 2036 deleteIfTriviallyDead(OldOp); 2037 return NewPtr == &NewAI && !SI.isVolatile(); 2038 } 2039 2040 bool visitMemSetInst(MemSetInst &II) { 2041 DEBUG(dbgs() << " original: " << II << "\n"); 2042 IRBuilder<> IRB(&II); 2043 assert(II.getRawDest() == OldPtr); 2044 2045 // If the memset has a variable size, it cannot be split, just adjust the 2046 // pointer to the new alloca. 2047 if (!isa<Constant>(II.getLength())) { 2048 II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); 2049 deleteIfTriviallyDead(OldPtr); 2050 return false; 2051 } 2052 2053 // Record this instruction for deletion. 2054 if (Pass.DeadSplitInsts.insert(&II)) 2055 Pass.DeadInsts.push_back(&II); 2056 2057 Type *AllocaTy = NewAI.getAllocatedType(); 2058 Type *ScalarTy = AllocaTy->getScalarType(); 2059 2060 // If this doesn't map cleanly onto the alloca type, and that type isn't 2061 // a single value type, just emit a memset. 2062 if (!VecTy && (BeginOffset != NewAllocaBeginOffset || 2063 EndOffset != NewAllocaEndOffset || 2064 !AllocaTy->isSingleValueType() || 2065 !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) { 2066 Type *SizeTy = II.getLength()->getType(); 2067 Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); 2068 2069 CallInst *New 2070 = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB, 2071 II.getRawDest()->getType()), 2072 II.getValue(), Size, II.getAlignment(), 2073 II.isVolatile()); 2074 (void)New; 2075 DEBUG(dbgs() << " to: " << *New << "\n"); 2076 return false; 2077 } 2078 2079 // If we can represent this as a simple value, we have to build the actual 2080 // value to store, which requires expanding the byte present in memset to 2081 // a sensible representation for the alloca type. This is essentially 2082 // splatting the byte to a sufficiently wide integer, bitcasting to the 2083 // desired scalar type, and splatting it across any desired vector type. 2084 Value *V = II.getValue(); 2085 IntegerType *VTy = cast<IntegerType>(V->getType()); 2086 Type *IntTy = Type::getIntNTy(VTy->getContext(), 2087 TD.getTypeSizeInBits(ScalarTy)); 2088 if (TD.getTypeSizeInBits(ScalarTy) > VTy->getBitWidth()) 2089 V = IRB.CreateMul(IRB.CreateZExt(V, IntTy, getName(".zext")), 2090 ConstantExpr::getUDiv( 2091 Constant::getAllOnesValue(IntTy), 2092 ConstantExpr::getZExt( 2093 Constant::getAllOnesValue(V->getType()), 2094 IntTy)), 2095 getName(".isplat")); 2096 if (V->getType() != ScalarTy) { 2097 if (ScalarTy->isPointerTy()) 2098 V = IRB.CreateIntToPtr(V, ScalarTy); 2099 else if (ScalarTy->isPrimitiveType() || ScalarTy->isVectorTy()) 2100 V = IRB.CreateBitCast(V, ScalarTy); 2101 else if (ScalarTy->isIntegerTy()) 2102 llvm_unreachable("Computed different integer types with equal widths"); 2103 else 2104 llvm_unreachable("Invalid scalar type"); 2105 } 2106 2107 // If this is an element-wide memset of a vectorizable alloca, insert it. 
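  // Illustrative IR for this case (hypothetical names): a memset covering
  // only element 2 of a <4 x i32> alloca becomes
  //   %load = load <4 x i32>* %newai
  //   %ins  = insertelement <4 x i32> %load, i32 %splat, i32 2
  //   store <4 x i32> %ins, <4 x i32>* %newai
  // i.e. a read-modify-write of the whole vector rather than a byte memset.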
2108 if (VecTy && (BeginOffset > NewAllocaBeginOffset || 2109 EndOffset < NewAllocaEndOffset)) { 2110 StoreInst *Store = IRB.CreateStore( 2111 IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")), V, 2112 getIndex(IRB, BeginOffset), 2113 getName(".insert")), 2114 &NewAI); 2115 (void)Store; 2116 DEBUG(dbgs() << " to: " << *Store << "\n"); 2117 return true; 2118 } 2119 2120 // Splat to a vector if needed. 2121 if (VectorType *VecTy = dyn_cast<VectorType>(AllocaTy)) { 2122 VectorType *SplatSourceTy = VectorType::get(V->getType(), 1); 2123 V = IRB.CreateShuffleVector( 2124 IRB.CreateInsertElement(UndefValue::get(SplatSourceTy), V, 2125 IRB.getInt32(0), getName(".vsplat.insert")), 2126 UndefValue::get(SplatSourceTy), 2127 ConstantVector::getSplat(VecTy->getNumElements(), IRB.getInt32(0)), 2128 getName(".vsplat.shuffle")); 2129 assert(V->getType() == VecTy); 2130 } 2131 2132 Value *New = IRB.CreateStore(V, &NewAI, II.isVolatile()); 2133 (void)New; 2134 DEBUG(dbgs() << " to: " << *New << "\n"); 2135 return !II.isVolatile(); 2136 } 2137 2138 bool visitMemTransferInst(MemTransferInst &II) { 2139 // Rewriting of memory transfer instructions can be a bit tricky. We break 2140 // them into two categories: split intrinsics and unsplit intrinsics. 2141 2142 DEBUG(dbgs() << " original: " << II << "\n"); 2143 IRBuilder<> IRB(&II); 2144 2145 assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr); 2146 bool IsDest = II.getRawDest() == OldPtr; 2147 2148 const AllocaPartitioning::MemTransferOffsets &MTO 2149 = P.getMemTransferOffsets(II); 2150 2151 // For unsplit intrinsics, we simply modify the source and destination 2152 // pointers in place. This isn't just an optimization, it is a matter of 2153 // correctness. With unsplit intrinsics we may be dealing with transfers 2154 // within a single alloca before SROA ran, or with transfers that have 2155 // a variable length. We may also be dealing with memmove instead of 2156 // memcpy, and so simply updating the pointers is the necessary for us to 2157 // update both source and dest of a single call. 2158 if (!MTO.IsSplittable) { 2159 Value *OldOp = IsDest ? II.getRawDest() : II.getRawSource(); 2160 if (IsDest) 2161 II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); 2162 else 2163 II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType())); 2164 2165 DEBUG(dbgs() << " to: " << II << "\n"); 2166 deleteIfTriviallyDead(OldOp); 2167 return false; 2168 } 2169 // For split transfer intrinsics we have an incredibly useful assurance: 2170 // the source and destination do not reside within the same alloca, and at 2171 // least one of them does not escape. This means that we can replace 2172 // memmove with memcpy, and we don't need to worry about all manner of 2173 // downsides to splitting and transforming the operations. 2174 2175 // Compute the relative offset within the transfer. 2176 unsigned IntPtrWidth = TD.getPointerSizeInBits(); 2177 APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin 2178 : MTO.SourceBegin)); 2179 2180 // If this doesn't map cleanly onto the alloca type, and that type isn't 2181 // a single value type, just emit a memcpy. 2182 bool EmitMemCpy 2183 = !VecTy && (BeginOffset != NewAllocaBeginOffset || 2184 EndOffset != NewAllocaEndOffset || 2185 !NewAI.getAllocatedType()->isSingleValueType()); 2186 2187 // If we're just going to emit a memcpy, the alloca hasn't changed, and the 2188 // size hasn't been shrunk based on analysis of the viable range, this is 2189 // a no-op. 
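  // Example (illustrative): if the original transfer covered bytes [0,16) of
  // this alloca but only [0,8) remains live in the partition, the code below
  // keeps the memcpy in place and merely rewrites its length operand to 8;
  // if the bounds match exactly, nothing changes at all.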
2190 if (EmitMemCpy && &OldAI == &NewAI) { 2191 uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin; 2192 uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd; 2193 // Ensure the start lines up. 2194 assert(BeginOffset == OrigBegin); 2195 (void)OrigBegin; 2196 2197 // Rewrite the size as needed. 2198 if (EndOffset != OrigEnd) 2199 II.setLength(ConstantInt::get(II.getLength()->getType(), 2200 EndOffset - BeginOffset)); 2201 return false; 2202 } 2203 // Record this instruction for deletion. 2204 if (Pass.DeadSplitInsts.insert(&II)) 2205 Pass.DeadInsts.push_back(&II); 2206 2207 bool IsVectorElement = VecTy && (BeginOffset > NewAllocaBeginOffset || 2208 EndOffset < NewAllocaEndOffset); 2209 2210 Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() 2211 : II.getRawDest()->getType(); 2212 if (!EmitMemCpy) 2213 OtherPtrTy = IsVectorElement ? VecTy->getElementType()->getPointerTo() 2214 : NewAI.getType(); 2215 2216 // Compute the other pointer, folding as much as possible to produce 2217 // a single, simple GEP in most cases. 2218 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); 2219 OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, 2220 getName("." + OtherPtr->getName())); 2221 2222 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2223 // alloca that should be re-examined after rewriting this instruction. 2224 if (AllocaInst *AI 2225 = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) 2226 Pass.Worklist.insert(AI); 2227 2228 if (EmitMemCpy) { 2229 Value *OurPtr 2230 = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() 2231 : II.getRawSource()->getType()); 2232 Type *SizeTy = II.getLength()->getType(); 2233 Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); 2234 2235 CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr, 2236 IsDest ? OtherPtr : OurPtr, 2237 Size, II.getAlignment(), 2238 II.isVolatile()); 2239 (void)New; 2240 DEBUG(dbgs() << " to: " << *New << "\n"); 2241 return false; 2242 } 2243 2244 Value *SrcPtr = OtherPtr; 2245 Value *DstPtr = &NewAI; 2246 if (!IsDest) 2247 std::swap(SrcPtr, DstPtr); 2248 2249 Value *Src; 2250 if (IsVectorElement && !IsDest) { 2251 // We have to extract rather than load. 2252 Src = IRB.CreateExtractElement(IRB.CreateLoad(SrcPtr, 2253 getName(".copyload")), 2254 getIndex(IRB, BeginOffset), 2255 getName(".copyextract")); 2256 } else { 2257 Src = IRB.CreateLoad(SrcPtr, II.isVolatile(), getName(".copyload")); 2258 } 2259 2260 if (IsVectorElement && IsDest) { 2261 // We have to insert into a loaded copy before storing. 2262 Src = IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")), 2263 Src, getIndex(IRB, BeginOffset), 2264 getName(".insert")); 2265 } 2266 2267 Value *Store = IRB.CreateStore(Src, DstPtr, II.isVolatile()); 2268 (void)Store; 2269 DEBUG(dbgs() << " to: " << *Store << "\n"); 2270 return !II.isVolatile(); 2271 } 2272 2273 bool visitIntrinsicInst(IntrinsicInst &II) { 2274 assert(II.getIntrinsicID() == Intrinsic::lifetime_start || 2275 II.getIntrinsicID() == Intrinsic::lifetime_end); 2276 DEBUG(dbgs() << " original: " << II << "\n"); 2277 IRBuilder<> IRB(&II); 2278 assert(II.getArgOperand(1) == OldPtr); 2279 2280 // Record this instruction for deletion. 
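  // Sketch of the rewrite performed below (hypothetical names): a marker like
  //   call void @llvm.lifetime.start(i64 32, i8* %old.cast)
  // is re-issued against the new partition alloca with the partition's size,
  //   call void @llvm.lifetime.start(i64 8, i8* %new.cast)
  // while the original call is queued for deletion.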
2281 if (Pass.DeadSplitInsts.insert(&II)) 2282 Pass.DeadInsts.push_back(&II); 2283 2284 ConstantInt *Size 2285 = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), 2286 EndOffset - BeginOffset); 2287 Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType()); 2288 Value *New; 2289 if (II.getIntrinsicID() == Intrinsic::lifetime_start) 2290 New = IRB.CreateLifetimeStart(Ptr, Size); 2291 else 2292 New = IRB.CreateLifetimeEnd(Ptr, Size); 2293 2294 DEBUG(dbgs() << " to: " << *New << "\n"); 2295 return true; 2296 } 2297 2298 /// PHI instructions that use an alloca and are subsequently loaded can be 2299 /// rewritten to load both input pointers in the pred blocks and then PHI the 2300 /// results, allowing the load of the alloca to be promoted. 2301 /// From this: 2302 /// %P2 = phi [i32* %Alloca, i32* %Other] 2303 /// %V = load i32* %P2 2304 /// to: 2305 /// %V1 = load i32* %Alloca -> will be mem2reg'd 2306 /// ... 2307 /// %V2 = load i32* %Other 2308 /// ... 2309 /// %V = phi [i32 %V1, i32 %V2] 2310 /// 2311 /// We can do this to a select if its only uses are loads and if the operand 2312 /// to the select can be loaded unconditionally. 2313 /// 2314 /// FIXME: This should be hoisted into a generic utility, likely in 2315 /// Transforms/Util/Local.h 2316 bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) { 2317 // For now, we can only do this promotion if the load is in the same block 2318 // as the PHI, and if there are no stores between the phi and load. 2319 // TODO: Allow recursive phi users. 2320 // TODO: Allow stores. 2321 BasicBlock *BB = PN.getParent(); 2322 unsigned MaxAlign = 0; 2323 for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); 2324 UI != UE; ++UI) { 2325 LoadInst *LI = dyn_cast<LoadInst>(*UI); 2326 if (LI == 0 || !LI->isSimple()) return false; 2327 2328 // For now we only allow loads in the same block as the PHI. This is 2329 // a common case that happens when instcombine merges two loads through 2330 // a PHI. 2331 if (LI->getParent() != BB) return false; 2332 2333 // Ensure that there are no instructions between the PHI and the load that 2334 // could store. 2335 for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) 2336 if (BBI->mayWriteToMemory()) 2337 return false; 2338 2339 MaxAlign = std::max(MaxAlign, LI->getAlignment()); 2340 Loads.push_back(LI); 2341 } 2342 2343 // We can only transform this if it is safe to push the loads into the 2344 // predecessor blocks. The only thing to watch out for is that we can't put 2345 // a possibly trapping load in the predecessor if it is a critical edge. 2346 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; 2347 ++Idx) { 2348 TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); 2349 Value *InVal = PN.getIncomingValue(Idx); 2350 2351 // If the value is produced by the terminator of the predecessor (an 2352 // invoke) or it has side-effects, there is no valid place to put a load 2353 // in the predecessor. 2354 if (TI == InVal || TI->mayHaveSideEffects()) 2355 return false; 2356 2357 // If the predecessor has a single successor, then the edge isn't 2358 // critical. 2359 if (TI->getNumSuccessors() == 1) 2360 continue; 2361 2362 // If this pointer is always safe to load, or if we can prove that there 2363 // is already a load in the block, then we can move the load to the pred 2364 // block. 
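  // For instance (illustrative): an incoming value that is itself an alloca
  // or a dereferenceable global can always be loaded speculatively, while an
  // arbitrary pointer only qualifies if isSafeToLoadUnconditionally can find
  // evidence, such as an existing load of it, that the access cannot trap.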
2365 if (InVal->isDereferenceablePointer() || 2366 isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD)) 2367 continue; 2368 2369 return false; 2370 } 2371 2372 return true; 2373 } 2374 2375 bool visitPHINode(PHINode &PN) { 2376 DEBUG(dbgs() << " original: " << PN << "\n"); 2377 // We would like to compute a new pointer in only one place, but have it be 2378 // as local as possible to the PHI. To do that, we re-use the location of 2379 // the old pointer, which necessarily must be in the right position to 2380 // dominate the PHI. 2381 IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr)); 2382 2383 SmallVector<LoadInst *, 4> Loads; 2384 if (!isSafePHIToSpeculate(PN, Loads)) { 2385 Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); 2386 // Replace the operands which were using the old pointer. 2387 User::op_iterator OI = PN.op_begin(), OE = PN.op_end(); 2388 for (; OI != OE; ++OI) 2389 if (*OI == OldPtr) 2390 *OI = NewPtr; 2391 2392 DEBUG(dbgs() << " to: " << PN << "\n"); 2393 deleteIfTriviallyDead(OldPtr); 2394 return false; 2395 } 2396 assert(!Loads.empty()); 2397 2398 Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); 2399 IRBuilder<> PHIBuilder(&PN); 2400 PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues()); 2401 NewPN->takeName(&PN); 2402 2403 // Get the TBAA tag and alignment to use from one of the loads. It doesn't 2404 // matter which one we get and if any differ, it doesn't matter. 2405 LoadInst *SomeLoad = cast<LoadInst>(Loads.back()); 2406 MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); 2407 unsigned Align = SomeLoad->getAlignment(); 2408 Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); 2409 2410 // Rewrite all loads of the PN to use the new PHI. 2411 do { 2412 LoadInst *LI = Loads.pop_back_val(); 2413 LI->replaceAllUsesWith(NewPN); 2414 Pass.DeadInsts.push_back(LI); 2415 } while (!Loads.empty()); 2416 2417 // Inject loads into all of the pred blocks. 2418 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 2419 BasicBlock *Pred = PN.getIncomingBlock(Idx); 2420 TerminatorInst *TI = Pred->getTerminator(); 2421 Value *InVal = PN.getIncomingValue(Idx); 2422 IRBuilder<> PredBuilder(TI); 2423 2424 // Map the value to the new alloca pointer if this was the old alloca 2425 // pointer. 2426 bool ThisOperand = InVal == OldPtr; 2427 if (ThisOperand) 2428 InVal = NewPtr; 2429 2430 LoadInst *Load 2431 = PredBuilder.CreateLoad(InVal, getName(".sroa.speculate." + 2432 Pred->getName())); 2433 ++NumLoadsSpeculated; 2434 Load->setAlignment(Align); 2435 if (TBAATag) 2436 Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); 2437 NewPN->addIncoming(Load, Pred); 2438 2439 if (ThisOperand) 2440 continue; 2441 Instruction *OtherPtr = dyn_cast<Instruction>(InVal); 2442 if (!OtherPtr) 2443 // No uses to rewrite. 2444 continue; 2445 2446 // Try to lookup and rewrite any partition uses corresponding to this phi 2447 // input. 2448 AllocaPartitioning::iterator PI 2449 = P.findPartitionForPHIOrSelectOperand(PN, OtherPtr); 2450 if (PI != P.end()) { 2451 // If the other pointer is within the partitioning, replace the PHI in 2452 // its uses with the load we just speculated, or add another load for 2453 // it to rewrite if we've already replaced the PHI. 
2454 AllocaPartitioning::use_iterator UI 2455 = P.findPartitionUseForPHIOrSelectOperand(PN, OtherPtr); 2456 if (isa<PHINode>(*UI->User)) 2457 UI->User = Load; 2458 else { 2459 AllocaPartitioning::PartitionUse OtherUse = *UI; 2460 OtherUse.User = Load; 2461 P.use_push_back(PI, OtherUse); 2462 } 2463 } 2464 } 2465 DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); 2466 return NewPtr == &NewAI; 2467 } 2468 2469 /// Select instructions that use an alloca and are subsequently loaded can be 2470 /// rewritten to load both input pointers and then select between the result, 2471 /// allowing the load of the alloca to be promoted. 2472 /// From this: 2473 /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other 2474 /// %V = load i32* %P2 2475 /// to: 2476 /// %V1 = load i32* %Alloca -> will be mem2reg'd 2477 /// %V2 = load i32* %Other 2478 /// %V = select i1 %cond, i32 %V1, i32 %V2 2479 /// 2480 /// We can do this to a select if its only uses are loads and if the operand 2481 /// to the select can be loaded unconditionally. 2482 bool isSafeSelectToSpeculate(SelectInst &SI, 2483 SmallVectorImpl<LoadInst *> &Loads) { 2484 Value *TValue = SI.getTrueValue(); 2485 Value *FValue = SI.getFalseValue(); 2486 bool TDerefable = TValue->isDereferenceablePointer(); 2487 bool FDerefable = FValue->isDereferenceablePointer(); 2488 2489 for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); 2490 UI != UE; ++UI) { 2491 LoadInst *LI = dyn_cast<LoadInst>(*UI); 2492 if (LI == 0 || !LI->isSimple()) return false; 2493 2494 // Both operands to the select need to be dereferencable, either 2495 // absolutely (e.g. allocas) or at this point because we can see other 2496 // accesses to it. 2497 if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI, 2498 LI->getAlignment(), &TD)) 2499 return false; 2500 if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI, 2501 LI->getAlignment(), &TD)) 2502 return false; 2503 Loads.push_back(LI); 2504 } 2505 2506 return true; 2507 } 2508 2509 bool visitSelectInst(SelectInst &SI) { 2510 DEBUG(dbgs() << " original: " << SI << "\n"); 2511 IRBuilder<> IRB(&SI); 2512 2513 // Find the operand we need to rewrite here. 2514 bool IsTrueVal = SI.getTrueValue() == OldPtr; 2515 if (IsTrueVal) 2516 assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!"); 2517 else 2518 assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!"); 2519 Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType()); 2520 2521 // If the select isn't safe to speculate, just use simple logic to emit it. 2522 SmallVector<LoadInst *, 4> Loads; 2523 if (!isSafeSelectToSpeculate(SI, Loads)) { 2524 SI.setOperand(IsTrueVal ? 1 : 2, NewPtr); 2525 DEBUG(dbgs() << " to: " << SI << "\n"); 2526 deleteIfTriviallyDead(OldPtr); 2527 return false; 2528 } 2529 2530 Value *OtherPtr = IsTrueVal ? SI.getFalseValue() : SI.getTrueValue(); 2531 AllocaPartitioning::iterator PI 2532 = P.findPartitionForPHIOrSelectOperand(SI, OtherPtr); 2533 AllocaPartitioning::PartitionUse OtherUse; 2534 if (PI != P.end()) { 2535 // If the other pointer is within the partitioning, remove the select 2536 // from its uses. We'll add in the new loads below. 2537 AllocaPartitioning::use_iterator UI 2538 = P.findPartitionUseForPHIOrSelectOperand(SI, OtherPtr); 2539 OtherUse = *UI; 2540 P.use_erase(PI, UI); 2541 } 2542 2543 Value *TV = IsTrueVal ? NewPtr : SI.getTrueValue(); 2544 Value *FV = IsTrueVal ? SI.getFalseValue() : NewPtr; 2545 // Replace the loads of the select with a select of two loads. 
2546 while (!Loads.empty()) { 2547 LoadInst *LI = Loads.pop_back_val(); 2548 2549 IRB.SetInsertPoint(LI); 2550 LoadInst *TL = 2551 IRB.CreateLoad(TV, getName("." + LI->getName() + ".true")); 2552 LoadInst *FL = 2553 IRB.CreateLoad(FV, getName("." + LI->getName() + ".false")); 2554 NumLoadsSpeculated += 2; 2555 if (PI != P.end()) { 2556 LoadInst *OtherLoad = IsTrueVal ? FL : TL; 2557 assert(OtherUse.Ptr == OtherLoad->getOperand(0)); 2558 OtherUse.User = OtherLoad; 2559 P.use_push_back(PI, OtherUse); 2560 } 2561 2562 // Transfer alignment and TBAA info if present. 2563 TL->setAlignment(LI->getAlignment()); 2564 FL->setAlignment(LI->getAlignment()); 2565 if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { 2566 TL->setMetadata(LLVMContext::MD_tbaa, Tag); 2567 FL->setMetadata(LLVMContext::MD_tbaa, Tag); 2568 } 2569 2570 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL); 2571 V->takeName(LI); 2572 DEBUG(dbgs() << " speculated to: " << *V << "\n"); 2573 LI->replaceAllUsesWith(V); 2574 Pass.DeadInsts.push_back(LI); 2575 } 2576 2577 deleteIfTriviallyDead(OldPtr); 2578 return NewPtr == &NewAI; 2579 } 2580 2581}; 2582} 2583 2584namespace { 2585/// \brief Visitor to rewrite aggregate loads and stores as scalar. 2586/// 2587/// This pass aggressively rewrites all aggregate loads and stores on 2588/// a particular pointer (or any pointer derived from it which we can identify) 2589/// with scalar loads and stores. 2590class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { 2591 // Befriend the base class so it can delegate to private visit methods. 2592 friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; 2593 2594 const TargetData &TD; 2595 2596 /// Queue of pointer uses to analyze and potentially rewrite. 2597 SmallVector<Use *, 8> Queue; 2598 2599 /// Set to prevent us from cycling with phi nodes and loops. 2600 SmallPtrSet<User *, 8> Visited; 2601 2602 /// The current pointer use being rewritten. This is used to dig up the used 2603 /// value (as opposed to the user). 2604 Use *U; 2605 2606public: 2607 AggLoadStoreRewriter(const TargetData &TD) : TD(TD) {} 2608 2609 /// Rewrite loads and stores through a pointer and all pointers derived from 2610 /// it. 2611 bool rewrite(Instruction &I) { 2612 DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); 2613 enqueueUsers(I); 2614 bool Changed = false; 2615 while (!Queue.empty()) { 2616 U = Queue.pop_back_val(); 2617 Changed |= visit(cast<Instruction>(U->getUser())); 2618 } 2619 return Changed; 2620 } 2621 2622private: 2623 /// Enqueue all the users of the given instruction for further processing. 2624 /// This uses a set to de-duplicate users. 2625 void enqueueUsers(Instruction &I) { 2626 for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; 2627 ++UI) 2628 if (Visited.insert(*UI)) 2629 Queue.push_back(&UI.getUse()); 2630 } 2631 2632 // Conservative default is to not rewrite anything. 2633 bool visitInstruction(Instruction &I) { return false; } 2634 2635 /// \brief Generic recursive split emission class. 2636 template <typename Derived> 2637 class OpSplitter { 2638 protected: 2639 /// The builder used to form new instructions. 2640 IRBuilder<> IRB; 2641 /// The indices which to be used with insert- or extractvalue to select the 2642 /// appropriate value within the aggregate. 2643 SmallVector<unsigned, 4> Indices; 2644 /// The indices to a GEP instruction which will move Ptr to the correct slot 2645 /// within the aggregate. 
2646 SmallVector<Value *, 4> GEPIndices; 2647 /// The base pointer of the original op, used as a base for GEPing the 2648 /// split operations. 2649 Value *Ptr; 2650 2651 /// Initialize the splitter with an insertion point, Ptr and start with a 2652 /// single zero GEP index. 2653 OpSplitter(Instruction *InsertionPoint, Value *Ptr) 2654 : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} 2655 2656 public: 2657 /// \brief Generic recursive split emission routine. 2658 /// 2659 /// This method recursively splits an aggregate op (load or store) into 2660 /// scalar or vector ops. It splits recursively until it hits a single value 2661 /// and emits that single value operation via the template argument. 2662 /// 2663 /// The logic of this routine relies on GEPs and insertvalue and 2664 /// extractvalue all operating with the same fundamental index list, merely 2665 /// formatted differently (GEPs need actual values). 2666 /// 2667 /// \param Ty The type being split recursively into smaller ops. 2668 /// \param Agg The aggregate value being built up or stored, depending on 2669 /// whether this is splitting a load or a store respectively. 2670 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) { 2671 if (Ty->isSingleValueType()) 2672 return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name); 2673 2674 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 2675 unsigned OldSize = Indices.size(); 2676 (void)OldSize; 2677 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size; 2678 ++Idx) { 2679 assert(Indices.size() == OldSize && "Did not return to the old size"); 2680 Indices.push_back(Idx); 2681 GEPIndices.push_back(IRB.getInt32(Idx)); 2682 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx)); 2683 GEPIndices.pop_back(); 2684 Indices.pop_back(); 2685 } 2686 return; 2687 } 2688 2689 if (StructType *STy = dyn_cast<StructType>(Ty)) { 2690 unsigned OldSize = Indices.size(); 2691 (void)OldSize; 2692 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size; 2693 ++Idx) { 2694 assert(Indices.size() == OldSize && "Did not return to the old size"); 2695 Indices.push_back(Idx); 2696 GEPIndices.push_back(IRB.getInt32(Idx)); 2697 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx)); 2698 GEPIndices.pop_back(); 2699 Indices.pop_back(); 2700 } 2701 return; 2702 } 2703 2704 llvm_unreachable("Only arrays and structs are aggregate loadable types"); 2705 } 2706 }; 2707 2708 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { 2709 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr) 2710 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} 2711 2712 /// Emit a leaf load of a single value. This is called at the leaves of the 2713 /// recursive emission to actually load values. 2714 void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { 2715 assert(Ty->isSingleValueType()); 2716 // Load the single value and insert it using the indices. 2717 Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices, 2718 Name + ".gep"), 2719 Name + ".load"); 2720 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); 2721 DEBUG(dbgs() << " to: " << *Load << "\n"); 2722 } 2723 }; 2724 2725 bool visitLoadInst(LoadInst &LI) { 2726 assert(LI.getPointerOperand() == *U); 2727 if (!LI.isSimple() || LI.getType()->isSingleValueType()) 2728 return false; 2729 2730 // We have an aggregate being loaded, split it apart. 
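  // Sketch of the rewrite (illustrative IR, hypothetical names):
  //   %fca = load { i32, float }* %p
  // becomes
  //   %p.0 = getelementptr inbounds { i32, float }* %p, i32 0, i32 0
  //   %v.0 = load i32* %p.0
  //   %i.0 = insertvalue { i32, float } undef, i32 %v.0, 0
  //   %p.1 = getelementptr inbounds { i32, float }* %p, i32 0, i32 1
  //   %v.1 = load float* %p.1
  //   %fca.new = insertvalue { i32, float } %i.0, float %v.1, 1
  // after which all uses of %fca are replaced with %fca.new.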
2731 DEBUG(dbgs() << " original: " << LI << "\n"); 2732 LoadOpSplitter Splitter(&LI, *U); 2733 Value *V = UndefValue::get(LI.getType()); 2734 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); 2735 LI.replaceAllUsesWith(V); 2736 LI.eraseFromParent(); 2737 return true; 2738 } 2739 2740 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { 2741 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) 2742 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} 2743 2744 /// Emit a leaf store of a single value. This is called at the leaves of the 2745 /// recursive emission to actually produce stores. 2746 void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { 2747 assert(Ty->isSingleValueType()); 2748 // Extract the single value and store it using the indices. 2749 Value *Store = IRB.CreateStore( 2750 IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), 2751 IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); 2752 (void)Store; 2753 DEBUG(dbgs() << " to: " << *Store << "\n"); 2754 } 2755 }; 2756 2757 bool visitStoreInst(StoreInst &SI) { 2758 if (!SI.isSimple() || SI.getPointerOperand() != *U) 2759 return false; 2760 Value *V = SI.getValueOperand(); 2761 if (V->getType()->isSingleValueType()) 2762 return false; 2763 2764 // We have an aggregate being stored, split it apart. 2765 DEBUG(dbgs() << " original: " << SI << "\n"); 2766 StoreOpSplitter Splitter(&SI, *U); 2767 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); 2768 SI.eraseFromParent(); 2769 return true; 2770 } 2771 2772 bool visitBitCastInst(BitCastInst &BC) { 2773 enqueueUsers(BC); 2774 return false; 2775 } 2776 2777 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { 2778 enqueueUsers(GEPI); 2779 return false; 2780 } 2781 2782 bool visitPHINode(PHINode &PN) { 2783 enqueueUsers(PN); 2784 return false; 2785 } 2786 2787 bool visitSelectInst(SelectInst &SI) { 2788 enqueueUsers(SI); 2789 return false; 2790 } 2791}; 2792} 2793 2794/// \brief Try to find a partition of the aggregate type passed in for a given 2795/// offset and size. 2796/// 2797/// This recurses through the aggregate type and tries to compute a subtype 2798/// based on the offset and size. When the offset and size span a sub-section 2799/// of an array, it will even compute a new array type for that sub-section, 2800/// and the same for structs. 2801/// 2802/// Note that this routine is very strict and tries to find a partition of the 2803/// type which produces the *exact* right offset and size. It is not forgiving 2804/// when the size or offset cause either end of type-based partition to be off. 2805/// Also, this is a best-effort routine. It is reasonable to give up and not 2806/// return a type if necessary. 2807static Type *getTypePartition(const TargetData &TD, Type *Ty, 2808 uint64_t Offset, uint64_t Size) { 2809 if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size) 2810 return Ty; 2811 2812 if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { 2813 // We can't partition pointers... 
2814 if (SeqTy->isPointerTy()) 2815 return 0; 2816 2817 Type *ElementTy = SeqTy->getElementType(); 2818 uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); 2819 uint64_t NumSkippedElements = Offset / ElementSize; 2820 if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) 2821 if (NumSkippedElements >= ArrTy->getNumElements()) 2822 return 0; 2823 if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) 2824 if (NumSkippedElements >= VecTy->getNumElements()) 2825 return 0; 2826 Offset -= NumSkippedElements * ElementSize; 2827 2828 // First check if we need to recurse. 2829 if (Offset > 0 || Size < ElementSize) { 2830 // Bail if the partition ends in a different array element. 2831 if ((Offset + Size) > ElementSize) 2832 return 0; 2833 // Recurse through the element type trying to peel off offset bytes. 2834 return getTypePartition(TD, ElementTy, Offset, Size); 2835 } 2836 assert(Offset == 0); 2837 2838 if (Size == ElementSize) 2839 return ElementTy; 2840 assert(Size > ElementSize); 2841 uint64_t NumElements = Size / ElementSize; 2842 if (NumElements * ElementSize != Size) 2843 return 0; 2844 return ArrayType::get(ElementTy, NumElements); 2845 } 2846 2847 StructType *STy = dyn_cast<StructType>(Ty); 2848 if (!STy) 2849 return 0; 2850 2851 const StructLayout *SL = TD.getStructLayout(STy); 2852 if (Offset >= SL->getSizeInBytes()) 2853 return 0; 2854 uint64_t EndOffset = Offset + Size; 2855 if (EndOffset > SL->getSizeInBytes()) 2856 return 0; 2857 2858 unsigned Index = SL->getElementContainingOffset(Offset); 2859 Offset -= SL->getElementOffset(Index); 2860 2861 Type *ElementTy = STy->getElementType(Index); 2862 uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); 2863 if (Offset >= ElementSize) 2864 return 0; // The offset points into alignment padding. 2865 2866 // See if any partition must be contained by the element. 2867 if (Offset > 0 || Size < ElementSize) { 2868 if ((Offset + Size) > ElementSize) 2869 return 0; 2870 return getTypePartition(TD, ElementTy, Offset, Size); 2871 } 2872 assert(Offset == 0); 2873 2874 if (Size == ElementSize) 2875 return ElementTy; 2876 2877 StructType::element_iterator EI = STy->element_begin() + Index, 2878 EE = STy->element_end(); 2879 if (EndOffset < SL->getSizeInBytes()) { 2880 unsigned EndIndex = SL->getElementContainingOffset(EndOffset); 2881 if (Index == EndIndex) 2882 return 0; // Within a single element and its padding. 2883 2884 // Don't try to form "natural" types if the elements don't line up with the 2885 // expected size. 2886 // FIXME: We could potentially recurse down through the last element in the 2887 // sub-struct to find a natural end point. 2888 if (SL->getElementOffset(EndIndex) != EndOffset) 2889 return 0; 2890 2891 assert(Index < EndIndex); 2892 EE = STy->element_begin() + EndIndex; 2893 } 2894 2895 // Try to build up a sub-structure. 2896 SmallVector<Type *, 4> ElementTys; 2897 do { 2898 ElementTys.push_back(*EI++); 2899 } while (EI != EE); 2900 StructType *SubTy = StructType::get(STy->getContext(), ElementTys, 2901 STy->isPacked()); 2902 const StructLayout *SubSL = TD.getStructLayout(SubTy); 2903 if (Size != SubSL->getSizeInBytes()) 2904 return 0; // The sub-struct doesn't have quite the size needed. 2905 2906 return SubTy; 2907} 2908 2909/// \brief Rewrite an alloca partition's users. 2910/// 2911/// This routine drives both of the rewriting goals of the SROA pass. It tries 2912/// to rewrite uses of an alloca partition to be conducive for SSA value 2913/// promotion. 
If the partition needs a new, more refined alloca, this will 2914/// build that new alloca, preserving as much type information as possible, and 2915/// rewrite the uses of the old alloca to point at the new one and have the 2916/// appropriate new offsets. It also evaluates how successful the rewrite was 2917/// at enabling promotion and if it was successful queues the alloca to be 2918/// promoted. 2919bool SROA::rewriteAllocaPartition(AllocaInst &AI, 2920 AllocaPartitioning &P, 2921 AllocaPartitioning::iterator PI) { 2922 uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset; 2923 if (P.use_begin(PI) == P.use_end(PI)) 2924 return false; // No live uses left of this partition. 2925 2926 // Try to compute a friendly type for this partition of the alloca. This 2927 // won't always succeed, in which case we fall back to a legal integer type 2928 // or an i8 array of an appropriate size. 2929 Type *AllocaTy = 0; 2930 if (Type *PartitionTy = P.getCommonType(PI)) 2931 if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize) 2932 AllocaTy = PartitionTy; 2933 if (!AllocaTy) 2934 if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(), 2935 PI->BeginOffset, AllocaSize)) 2936 AllocaTy = PartitionTy; 2937 if ((!AllocaTy || 2938 (AllocaTy->isArrayTy() && 2939 AllocaTy->getArrayElementType()->isIntegerTy())) && 2940 TD->isLegalInteger(AllocaSize * 8)) 2941 AllocaTy = Type::getIntNTy(*C, AllocaSize * 8); 2942 if (!AllocaTy) 2943 AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize); 2944 assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize); 2945 2946 // Check for the case where we're going to rewrite to a new alloca of the 2947 // exact same type as the original, and with the same access offsets. In that 2948 // case, re-use the existing alloca, but still run through the rewriter to 2949 // performe phi and select speculation. 2950 AllocaInst *NewAI; 2951 if (AllocaTy == AI.getAllocatedType()) { 2952 assert(PI->BeginOffset == 0 && 2953 "Non-zero begin offset but same alloca type"); 2954 assert(PI == P.begin() && "Begin offset is zero on later partition"); 2955 NewAI = &AI; 2956 } else { 2957 // FIXME: The alignment here is overly conservative -- we could in many 2958 // cases get away with much weaker alignment constraints. 2959 NewAI = new AllocaInst(AllocaTy, 0, AI.getAlignment(), 2960 AI.getName() + ".sroa." + Twine(PI - P.begin()), 2961 &AI); 2962 ++NumNewAllocas; 2963 } 2964 2965 DEBUG(dbgs() << "Rewriting alloca partition " 2966 << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: " 2967 << *NewAI << "\n"); 2968 2969 AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI, 2970 PI->BeginOffset, PI->EndOffset); 2971 DEBUG(dbgs() << " rewriting "); 2972 DEBUG(P.print(dbgs(), PI, "")); 2973 if (Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI))) { 2974 DEBUG(dbgs() << " and queuing for promotion\n"); 2975 PromotableAllocas.push_back(NewAI); 2976 } else if (NewAI != &AI) { 2977 // If we can't promote the alloca, iterate on it to check for new 2978 // refinements exposed by splitting the current alloca. Don't iterate on an 2979 // alloca which didn't actually change and didn't get promoted. 2980 Worklist.insert(NewAI); 2981 } 2982 return true; 2983} 2984 2985/// \brief Walks the partitioning of an alloca rewriting uses of each partition. 
/// \brief Walks the partitioning of an alloca rewriting uses of each partition.
bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) {
  bool Changed = false;
  for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE;
       ++PI)
    Changed |= rewriteAllocaPartition(AI, P, PI);

  return Changed;
}

/// \brief Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// a partitioning of the alloca, and then hands it off to be split and
/// rewritten as needed.
bool SROA::runOnAlloca(AllocaInst &AI) {
  DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
  ++NumAllocasAnalyzed;

  // Special case dead allocas, as they're trivial.
  if (AI.use_empty()) {
    AI.eraseFromParent();
    return true;
  }

  // Skip alloca forms that this analysis can't handle.
  if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
      TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
    return false;

  // First check if this is a non-aggregate type that we should simply promote.
  if (!AI.getAllocatedType()->isAggregateType() && isAllocaPromotable(&AI)) {
    DEBUG(dbgs() << "  Trivially scalar type, queuing for promotion...\n");
    PromotableAllocas.push_back(&AI);
    return false;
  }

  bool Changed = false;

  // First, split any FCA loads and stores touching this alloca to expose
  // better splitting and promotion opportunities.
  AggLoadStoreRewriter AggRewriter(*TD);
  Changed |= AggRewriter.rewrite(AI);

  // Build the partition set using a recursive instruction-visiting builder.
  AllocaPartitioning P(*TD, AI);
  DEBUG(P.print(dbgs()));
  if (P.isEscaped())
    return Changed;

  // No partitions to split. Leave the dead alloca for a later pass to clean up.
  if (P.begin() == P.end())
    return Changed;

  // Delete all the dead users of this alloca before splitting and rewriting it.
  for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(),
                                              DE = P.dead_user_end();
       DI != DE; ++DI) {
    Changed = true;
    (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
    DeadInsts.push_back(*DI);
  }
  for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(),
                                            DE = P.dead_op_end();
       DO != DE; ++DO) {
    Value *OldV = **DO;
    // Clobber the use with an undef value.
    **DO = UndefValue::get(OldV->getType());
    if (Instruction *OldI = dyn_cast<Instruction>(OldV))
      if (isInstructionTriviallyDead(OldI)) {
        Changed = true;
        DeadInsts.push_back(OldI);
      }
  }

  return splitAlloca(AI, P) || Changed;
}

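// Illustrative contrast of the two runOnAlloca paths (hypothetical IR, not
// from a test case). A non-aggregate alloca such as
//   %a = alloca i32
//   store i32 %x, i32* %a
//   %v = load i32* %a
// is promotable as-is and is queued directly for promotion without being
// partitioned. An aggregate such as
//   %s = alloca { i32, i64 }
// with accesses to the individual fields instead has its FCA loads/stores
// split, is partitioned, and is handed to splitAlloca for rewriting.
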
/// \brief Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
/// minimize the problems of invalidated instruction pointers as such pointers
/// are used heavily in the intermediate stages of the algorithm.
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
  DeadSplitInsts.clear();
  while (!DeadInsts.empty()) {
    Instruction *I = DeadInsts.pop_back_val();
    DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");

    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
        // Zero out the operand and see if it becomes trivially dead.
        *OI = 0;
        if (isInstructionTriviallyDead(U))
          DeadInsts.push_back(U);
      }

    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
      DeletedAllocas.insert(AI);

    ++NumDeleted;
    I->eraseFromParent();
  }
}

/// \brief Promote the allocas, using the best available technique.
///
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
/// If there is a domtree available, we attempt to promote using the full power
/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is
/// based on the SSAUpdater utilities. This function returns whether any
/// promotion occurred.
bool SROA::promoteAllocas(Function &F) {
  if (PromotableAllocas.empty())
    return false;

  NumPromoted += PromotableAllocas.size();

  if (DT && !ForceSSAUpdater) {
    DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
    PromoteMemToReg(PromotableAllocas, *DT);
    PromotableAllocas.clear();
    return true;
  }

  DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
  SSAUpdater SSA;
  DIBuilder DIB(*F.getParent());
  SmallVector<Instruction*, 64> Insts;

  for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
    AllocaInst *AI = PromotableAllocas[Idx];
    for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
         UI != UE;) {
      Instruction *I = cast<Instruction>(*UI++);
      // FIXME: Currently the SSAUpdater infrastructure doesn't reason about
      // lifetime intrinsics and so we strip them (and the bitcasts+GEPs
      // leading to them) here. Eventually it should use them to optimize the
      // scalar values produced.
      if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
        assert(onlyUsedByLifetimeMarkers(I) &&
               "Found a bitcast used outside of a lifetime marker.");
        while (!I->use_empty())
          cast<Instruction>(*I->use_begin())->eraseFromParent();
        I->eraseFromParent();
        continue;
      }
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
        assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
               II->getIntrinsicID() == Intrinsic::lifetime_end);
        II->eraseFromParent();
        continue;
      }

      Insts.push_back(I);
    }
    AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
    Insts.clear();
  }

  PromotableAllocas.clear();
  return true;
}

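// Illustrative sketch of the SSAUpdater path above (hypothetical IR). For an
// alloca whose only non-load/store uses are lifetime markers, e.g.
//   %a = alloca i32
//   %p = bitcast i32* %a to i8*
//   call void @llvm.lifetime.start(i64 4, i8* %p)
//   ...
//   call void @llvm.lifetime.end(i64 4, i8* %p)
// the markers and the bitcast feeding them are erased first, and only the
// remaining loads and stores are handed to AllocaPromoter. The mem2reg path,
// by contrast, is used whenever a DominatorTree is available and
// -force-ssa-updater is not set.
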
namespace {
  /// \brief A predicate to test whether an alloca belongs to a set.
  class IsAllocaInSet {
    typedef SmallPtrSet<AllocaInst *, 4> SetType;
    const SetType &Set;

  public:
    IsAllocaInSet(const SetType &Set) : Set(Set) {}
    bool operator()(AllocaInst *AI) { return Set.count(AI); }
  };
}

bool SROA::runOnFunction(Function &F) {
  DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
  C = &F.getContext();
  TD = getAnalysisIfAvailable<TargetData>();
  if (!TD) {
    DEBUG(dbgs() << "  Skipping SROA -- no target data!\n");
    return false;
  }
  DT = getAnalysisIfAvailable<DominatorTree>();

  BasicBlock &EntryBB = F.getEntryBlock();
  for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end());
       I != E; ++I)
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
      Worklist.insert(AI);

  bool Changed = false;
  // A set of deleted alloca instruction pointers which should be removed from
  // the list of promotable allocas.
  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;

  while (!Worklist.empty()) {
    Changed |= runOnAlloca(*Worklist.pop_back_val());
    deleteDeadInstructions(DeletedAllocas);
    if (!DeletedAllocas.empty()) {
      PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
                                             PromotableAllocas.end(),
                                             IsAllocaInSet(DeletedAllocas)),
                              PromotableAllocas.end());
      DeletedAllocas.clear();
    }
  }

  Changed |= promoteAllocas(F);

  return Changed;
}

void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
  if (RequiresDomTree)
    AU.addRequired<DominatorTree>();
  AU.setPreservesCFG();
}
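
// Illustrative usage sketch (assumptions: a createSROAPass factory declared in
// llvm/Transforms/Scalar.h and the legacy FunctionPassManager; this is not
// part of the pass itself):
//
//   FunctionPassManager FPM(&M);
//   FPM.add(new TargetData(&M)); // without this, runOnFunction bails out early
//   FPM.add(createSROAPass());
//   FPM.doInitialization();
//   FPM.run(F);
//
// If no DominatorTree is computed, or -force-ssa-updater is given, promotion
// falls back to the SSAUpdater-based path in promoteAllocas.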