//===-- GlobalMerge.cpp - Internal globals merging -----------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This pass merges globals with internal linkage into one. This way all the
// globals which were merged can be addressed using offsets from the same base
// pointer (no need for a separate base pointer for each of the globals). Such
// a transformation can significantly reduce the register pressure when many
// globals are involved.
//
// For example, consider the code which touches several global variables at
// once:
//
// static int foo[N], bar[N], baz[N];
//
// for (i = 0; i < N; ++i) {
//    foo[i] = bar[i] * baz[i];
// }
//
// On ARM the addresses of 3 arrays should be kept in the registers, thus
// this code has quite large register pressure (loop body):
//
// ldr     r1, [r5], #4
// ldr     r2, [r6], #4
// mul     r1, r2, r1
// str     r1, [r0], #4
//
// The pass converts the code to something like:
//
// static struct {
//   int foo[N];
//   int bar[N];
//   int baz[N];
// } merged;
//
// for (i = 0; i < N; ++i) {
//    merged.foo[i] = merged.bar[i] * merged.baz[i];
// }
//
// and in ARM code this becomes:
//
// ldr     r0, [r5, #40]
// ldr     r1, [r5, #80]
// mul     r0, r1, r0
// str     r0, [r5], #4
//
// note that we saved 2 registers here almost "for free".
//
// However, merging globals can have tradeoffs:
// - it confuses debuggers, tools, and users
// - it makes linker optimizations less useful (order files, LOHs, ...)
// - it forces usage of indexed addressing (which isn't necessarily "free")
// - it can increase register pressure when the uses are disparate enough.
//
// We use heuristics to discover the best global grouping we can (cf cl::opts).
// ===---------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "global-merge"

// Master switch for the whole pass; everything below refines its behavior.
// FIXME: This is only useful as a last-resort way to disable the pass.
cl::opt<bool>
EnableGlobalMerge("enable-global-merge", cl::Hidden,
                  cl::desc("Enable the global merge pass"),
                  cl::init(true));

// When set, group globals by the functions that use them together instead of
// blindly merging everything (see the use-set discovery logic in doMerge).
static cl::opt<bool> GlobalMergeGroupByUse(
    "global-merge-group-by-use", cl::Hidden,
    cl::desc("Improve global merge pass to look at uses"), cl::init(true));

// When set, a global that is never used together with another global is left
// out of the merge entirely (merging a lone global buys nothing).
static cl::opt<bool> GlobalMergeIgnoreSingleUse(
    "global-merge-ignore-single-use", cl::Hidden,
    cl::desc("Improve global merge pass to ignore globals only used alone"),
    cl::init(true));

// Constant (read-only) globals are only merged when explicitly requested.
static cl::opt<bool>
EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden,
                         cl::desc("Enable global merge pass on constants"),
                         cl::init(false));

// FIXME: this could be a transitional option, and we probably need to remove
// it if only we are sure this optimization could always benefit all targets.
// Tri-state so targets can pick a default via createGlobalMergePass() while
// the command line can still force it either way (see createGlobalMergePass).
static cl::opt<cl::boolOrDefault>
EnableGlobalMergeOnExternal("global-merge-on-external", cl::Hidden,
                            cl::desc("Enable global merge pass on external linkage"));

STATISTIC(NumMerged, "Number of globals merged");

namespace {
  /// Function pass that merges small internal (and optionally external)
  /// globals into a single "_MergedGlobals" struct per address space, so that
  /// they can all be addressed from one base pointer. All real work happens in
  /// doInitialization/doFinalization; runOnFunction is a no-op.
  class GlobalMerge : public FunctionPass {
    const TargetMachine *TM;
    // Upper bound (in bytes) on the offset from the merged struct's base; set
    // by the target via the constructor.
    // FIXME: Infer the maximum possible offset depending on the actual users
    // (these max offsets are different for the users inside Thumb or ARM
    // functions), see the code that passes in the offset in the ARM backend
    // for more information.
    unsigned MaxOffset;

    /// Whether we should try to optimize for size only.
    /// Currently, this applies a dead simple heuristic: only consider globals
    /// used in minsize functions for merging.
    /// FIXME: This could learn about optsize, and be used in the cost model.
    bool OnlyOptimizeForSize;

    /// Whether we should merge global variables that have external linkage.
    bool MergeExternalGlobals;

    /// Merge candidates in \p Globals: either all of them at once, or grouped
    /// by which functions use them together (cf. GlobalMergeGroupByUse).
    bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
                 Module &M, bool isConst, unsigned AddrSpace) const;

    /// \brief Merge everything in \p Globals for which the corresponding bit
    /// in \p GlobalSet is set.
    bool doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
                 const BitVector &GlobalSet, Module &M, bool isConst,
                 unsigned AddrSpace) const;

    /// \brief Check if the given variable has been identified as must keep
    /// \pre setMustKeepGlobalVariables must have been called on the Module that
    /// contains GV
    bool isMustKeepGlobalVariable(const GlobalVariable *GV) const {
      return MustKeepGlobalVariables.count(GV);
    }

    /// Collect every variable marked as "used" or used in a landing pad
    /// instruction for this Module.
    void setMustKeepGlobalVariables(Module &M);

    /// Collect every variable marked as "used" (i.e. listed in llvm.used).
    void collectUsedGlobalVariables(Module &M);

    /// Keep track of the GlobalVariables that must not be merged away.
    SmallPtrSet<const GlobalVariable *, 16> MustKeepGlobalVariables;

  public:
    static char ID;             // Pass identification, replacement for typeid.
    explicit GlobalMerge(const TargetMachine *TM = nullptr,
                         unsigned MaximalOffset = 0,
                         bool OnlyOptimizeForSize = false,
                         bool MergeExternalGlobals = false)
        : FunctionPass(ID), TM(TM), MaxOffset(MaximalOffset),
          OnlyOptimizeForSize(OnlyOptimizeForSize),
          MergeExternalGlobals(MergeExternalGlobals) {
      initializeGlobalMergePass(*PassRegistry::getPassRegistry());
    }

    bool doInitialization(Module &M) override;
    bool runOnFunction(Function &F) override;
    bool doFinalization(Module &M) override;

    const char *getPassName() const override {
      return "Merge internal globals";
    }

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.setPreservesCFG();
      FunctionPass::getAnalysisUsage(AU);
    }
  };
} // end anonymous namespace

char GlobalMerge::ID = 0;
INITIALIZE_PASS_BEGIN(GlobalMerge, "global-merge", "Merge global variables",
                      false, false)
INITIALIZE_PASS_END(GlobalMerge, "global-merge", "Merge global variables",
                    false, false)

bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
                          Module &M, bool isConst, unsigned AddrSpace) const {
  auto &DL = M.getDataLayout();
  // Sort by allocation size so the packing loop in the other doMerge overload
  // fills each merged struct with as many (small) globals as fit in MaxOffset.
  // FIXME: Find better heuristics
  std::stable_sort(Globals.begin(), Globals.end(),
                   [&DL](const GlobalVariable *GV1, const GlobalVariable *GV2) {
                     return DL.getTypeAllocSize(GV1->getValueType()) <
                            DL.getTypeAllocSize(GV2->getValueType());
                   });

  // If we want to just blindly group all globals together, do so.
  if (!GlobalMergeGroupByUse) {
    BitVector AllGlobals(Globals.size());
    AllGlobals.set();
    return doMerge(Globals, AllGlobals, M, isConst, AddrSpace);
  }

  // If we want to be smarter, look at all uses of each global, to try to
  // discover all sets of globals used together, and how many times each of
  // these sets occurred.
  //
  // Keep this reasonably efficient, by having an append-only list of all sets
  // discovered so far (UsedGlobalSet), and mapping each "together-ness" unit of
  // code (currently, a Function) to the set of globals seen so far that are
  // used together in that unit (GlobalUsesByFunction).
  //
  // When we look at the Nth global, we know that any new set is either:
  // - the singleton set {N}, containing this global only, or
  // - the union of {N} and a previously-discovered set, containing some
  //   combination of the previous N-1 globals.
  // Using that knowledge, when looking at the Nth global, we can keep:
  // - a reference to the singleton set {N} (CurGVOnlySetIdx)
  // - a list mapping each previous set to its union with {N} (EncounteredUGS),
  //   if it actually occurs.

  // We keep track of the sets of globals used together "close enough".
  struct UsedGlobalSet {
    UsedGlobalSet(size_t Size) : Globals(Size), UsageCount(1) {}
    BitVector Globals;
    unsigned UsageCount;
  };

  // Each set is unique in UsedGlobalSets.
  std::vector<UsedGlobalSet> UsedGlobalSets;

  // Avoid repeating the create-global-set pattern.
  auto CreateGlobalSet = [&]() -> UsedGlobalSet & {
    UsedGlobalSets.emplace_back(Globals.size());
    return UsedGlobalSets.back();
  };

  // The first set is the empty set.
  CreateGlobalSet().UsageCount = 0;

  // We define "close enough" to be "in the same function".
  // FIXME: Grouping uses by function is way too aggressive, so we should have
  // a better metric for distance between uses.
  // The obvious alternative would be to group by BasicBlock, but that's in
  // turn too conservative..
  // Anything in between wouldn't be trivial to compute, so just stick with
  // per-function grouping.

  // The value type is an index into UsedGlobalSets.
  // The default (0) conveniently points to the empty set.
  DenseMap<Function *, size_t /*UsedGlobalSetIdx*/> GlobalUsesByFunction;

  // Now, look at each merge-eligible global in turn.

  // Keep track of the sets we already encountered to which we added the
  // current global.
  // Each element matches the same-index element in UsedGlobalSets.
  // This lets us efficiently tell whether a set has already been expanded to
  // include the current global.
  std::vector<size_t> EncounteredUGS;

  for (size_t GI = 0, GE = Globals.size(); GI != GE; ++GI) {
    GlobalVariable *GV = Globals[GI];

    // Reset the encountered sets for this global...
    std::fill(EncounteredUGS.begin(), EncounteredUGS.end(), 0);
    // ...and grow it in case we created new sets for the previous global.
    EncounteredUGS.resize(UsedGlobalSets.size());

    // We might need to create a set that only consists of the current global.
    // Keep track of its index into UsedGlobalSets.
    size_t CurGVOnlySetIdx = 0;

    // For each global, look at all its Uses.
    for (auto &U : GV->uses()) {
      // This Use might be a ConstantExpr. We're interested in Instruction
      // users, so look through ConstantExpr...
      Use *UI, *UE;
      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U.getUser())) {
        if (CE->use_empty())
          continue;
        UI = &*CE->use_begin();
        UE = nullptr;
      } else if (isa<Instruction>(U.getUser())) {
        UI = &U;
        UE = UI->getNext();
      } else {
        continue;
      }

      // ...to iterate on all the instruction users of the global.
      // Note that we iterate on Uses and not on Users to be able to getNext().
      for (; UI != UE; UI = UI->getNext()) {
        Instruction *I = dyn_cast<Instruction>(UI->getUser());
        if (!I)
          continue;

        Function *ParentFn = I->getParent()->getParent();

        // If we're only optimizing for size, ignore non-minsize functions.
        if (OnlyOptimizeForSize && !ParentFn->optForMinSize())
          continue;

        size_t UGSIdx = GlobalUsesByFunction[ParentFn];

        // If this is the first global this function uses, map the function to
        // the set consisting of this global only.
        if (!UGSIdx) {
          // If that set doesn't exist yet, create it.
          if (!CurGVOnlySetIdx) {
            CurGVOnlySetIdx = UsedGlobalSets.size();
            CreateGlobalSet().Globals.set(GI);
          } else {
            ++UsedGlobalSets[CurGVOnlySetIdx].UsageCount;
          }

          GlobalUsesByFunction[ParentFn] = CurGVOnlySetIdx;
          continue;
        }

        // If the function's current set already contains this global, just
        // increment the counter.
        if (UsedGlobalSets[UGSIdx].Globals.test(GI)) {
          ++UsedGlobalSets[UGSIdx].UsageCount;
          continue;
        }

        // If not, the previous set wasn't actually used in this function.
        --UsedGlobalSets[UGSIdx].UsageCount;

        // If we already expanded the previous set to include this global, just
        // reuse that expanded set.
        if (size_t ExpandedIdx = EncounteredUGS[UGSIdx]) {
          ++UsedGlobalSets[ExpandedIdx].UsageCount;
          GlobalUsesByFunction[ParentFn] = ExpandedIdx;
          continue;
        }

        // If not, create a new set consisting of the union of the previous set
        // and this global.  Mark it as encountered, so we can reuse it later.
        GlobalUsesByFunction[ParentFn] = EncounteredUGS[UGSIdx] =
            UsedGlobalSets.size();

        UsedGlobalSet &NewUGS = CreateGlobalSet();
        NewUGS.Globals.set(GI);
        NewUGS.Globals |= UsedGlobalSets[UGSIdx].Globals;
      }
    }
  }

  // Now we found a bunch of sets of globals used together.  We accumulated
  // the number of times we encountered the sets (i.e., the number of blocks
  // that use that exact set of globals).
  //
  // Multiply that by the size of the set to give us a crude profitability
  // metric.
  std::sort(UsedGlobalSets.begin(), UsedGlobalSets.end(),
            [](const UsedGlobalSet &UGS1, const UsedGlobalSet &UGS2) {
              return UGS1.Globals.count() * UGS1.UsageCount <
                     UGS2.Globals.count() * UGS2.UsageCount;
            });

  // We can choose to merge all globals together, but ignore globals never used
  // with another global.  This catches the obviously non-profitable cases of
  // having a single global, but is aggressive enough for any other case.
  if (GlobalMergeIgnoreSingleUse) {
    BitVector AllGlobals(Globals.size());
    // Iterate from most to least profitable (the vector is sorted ascending).
    for (size_t i = 0, e = UsedGlobalSets.size(); i != e; ++i) {
      const UsedGlobalSet &UGS = UsedGlobalSets[e - i - 1];
      if (UGS.UsageCount == 0)
        continue;
      if (UGS.Globals.count() > 1)
        AllGlobals |= UGS.Globals;
    }
    return doMerge(Globals, AllGlobals, M, isConst, AddrSpace);
  }

  // Starting from the sets with the best (=biggest) profitability, find a
  // good combination.
  // The ideal (and expensive) solution can only be found by trying all
  // combinations, looking for the one with the best profitability.
  // Don't be smart about it, and just pick the first compatible combination,
  // starting with the sets with the best profitability.
  BitVector PickedGlobals(Globals.size());
  bool Changed = false;

  for (size_t i = 0, e = UsedGlobalSets.size(); i != e; ++i) {
    const UsedGlobalSet &UGS = UsedGlobalSets[e - i - 1];
    if (UGS.UsageCount == 0)
      continue;
    if (PickedGlobals.anyCommon(UGS.Globals))
      continue;
    PickedGlobals |= UGS.Globals;
    // If the set only contains one global, there's no point in merging.
    // Ignore the global for inclusion in other sets though, so keep it in
    // PickedGlobals.
    if (UGS.Globals.count() < 2)
      continue;
    Changed |= doMerge(Globals, UGS.Globals, M, isConst, AddrSpace);
  }

  return Changed;
}

bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
                          const BitVector &GlobalSet, Module &M, bool isConst,
                          unsigned AddrSpace) const {
  assert(Globals.size() > 1);

  Type *Int32Ty = Type::getInt32Ty(M.getContext());
  auto &DL = M.getDataLayout();

  DEBUG(dbgs() << " Trying to merge set, starts with #"
               << GlobalSet.find_first() << "\n");

  // NOTE(review): ssize_t is POSIX, not standard C++; BitVector's
  // find_first/find_next return a signed int (-1 = none), so a portable
  // signed type would do — confirm against the BitVector API in this tree.
  ssize_t i = GlobalSet.find_first();
  // Outer loop: emit one merged struct per chunk of globals that fits within
  // MaxOffset; inner packing loop selects [i, j) for the current chunk.
  while (i != -1) {
    ssize_t j = 0;
    uint64_t MergedSize = 0;
    std::vector<Type*> Tys;
    std::vector<Constant*> Inits;

    for (j = i; j != -1; j = GlobalSet.find_next(j)) {
      Type *Ty = Globals[j]->getValueType();
      MergedSize += DL.getTypeAllocSize(Ty);
      if (MergedSize > MaxOffset) {
        break;
      }
      Tys.push_back(Ty);
      Inits.push_back(Globals[j]->getInitializer());
    }
    // NOTE(review): termination relies on every candidate being smaller than
    // MaxOffset (doInitialization filters on getTypeAllocSize(Ty) < MaxOffset);
    // otherwise j could stay == i and the outer loop would not advance.

    StructType *MergedTy = StructType::get(M.getContext(), Tys);
    Constant *MergedInit = ConstantStruct::get(MergedTy, Inits);

    GlobalVariable *MergedGV = new GlobalVariable(
        M, MergedTy, isConst, GlobalValue::PrivateLinkage, MergedInit,
        "_MergedGlobals", nullptr, GlobalVariable::NotThreadLocal, AddrSpace);

    // Redirect every use of the original globals to the matching field of the
    // merged struct, then delete the originals.
    for (ssize_t k = i, idx = 0; k != j; k = GlobalSet.find_next(k), ++idx) {
      GlobalValue::LinkageTypes Linkage = Globals[k]->getLinkage();
      // Copy the name before erasing the global it belongs to.
      std::string Name = Globals[k]->getName();

      Constant *Idx[2] = {
        ConstantInt::get(Int32Ty, 0),
        ConstantInt::get(Int32Ty, idx),
      };
      Constant *GEP =
          ConstantExpr::getInBoundsGetElementPtr(MergedTy, MergedGV, Idx);
      Globals[k]->replaceAllUsesWith(GEP);
      Globals[k]->eraseFromParent();

      // When the linkage is not internal we must emit an alias for the original
      // variable name as it may be accessed from another object. On non-Mach-O
      // we can also emit an alias for internal linkage as it's safe to do so.
      // It's not safe on Mach-O as the alias (and thus the portion of the
      // MergedGlobals variable) may be dead stripped at link time.
      if (Linkage != GlobalValue::InternalLinkage ||
          !TM->getTargetTriple().isOSBinFormatMachO()) {
        GlobalAlias::create(Tys[idx], AddrSpace, Linkage, Name, GEP, &M);
      }

      NumMerged++;
    }
    i = j;
  }

  return true;
}

void GlobalMerge::collectUsedGlobalVariables(Module &M) {
  // Extract global variables from llvm.used array
  const GlobalVariable *GV = M.getGlobalVariable("llvm.used");
  if (!GV || !GV->hasInitializer()) return;

  // Should be an array of 'i8*'.
  const ConstantArray *InitList = cast<ConstantArray>(GV->getInitializer());

  for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
    if (const GlobalVariable *G =
        dyn_cast<GlobalVariable>(InitList->getOperand(i)->stripPointerCasts()))
      MustKeepGlobalVariables.insert(G);
}

void GlobalMerge::setMustKeepGlobalVariables(Module &M) {
  collectUsedGlobalVariables(M);

  // Also keep any global referenced from a landing pad clause: merging it
  // would replace the clause operand with a GEP expression.
  for (Module::iterator IFn = M.begin(), IEndFn = M.end(); IFn != IEndFn;
       ++IFn) {
    for (Function::iterator IBB = IFn->begin(), IEndBB = IFn->end();
         IBB != IEndBB; ++IBB) {
      // Follow the invoke link to find the landing pad instruction
      const InvokeInst *II = dyn_cast<InvokeInst>(IBB->getTerminator());
      if (!II) continue;

      const LandingPadInst *LPInst = II->getUnwindDest()->getLandingPadInst();
      // Look for globals in the clauses of the landing pad instruction
      for (unsigned Idx = 0, NumClauses = LPInst->getNumClauses();
           Idx != NumClauses; ++Idx)
        if (const GlobalVariable *GV =
            dyn_cast<GlobalVariable>(LPInst->getClause(Idx)
                                     ->stripPointerCasts()))
          MustKeepGlobalVariables.insert(GV);
    }
  }
}

bool GlobalMerge::doInitialization(Module &M) {
  if (!EnableGlobalMerge)
    return false;

  auto &DL = M.getDataLayout();
  // Candidates bucketed by address space: merged structs never mix address
  // spaces or section kinds (BSS / constant / other data).
  DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals,
                                                        BSSGlobals;
  bool Changed = false;
  setMustKeepGlobalVariables(M);

  // Grab all non-const globals.
  for (auto &GV : M.globals()) {
    // Merge is safe for "normal" internal or external globals only
    if (GV.isDeclaration() || GV.isThreadLocal() || GV.hasSection())
      continue;

    if (!(MergeExternalGlobals && GV.hasExternalLinkage()) &&
        !GV.hasInternalLinkage())
      continue;

    PointerType *PT = dyn_cast<PointerType>(GV.getType());
    assert(PT && "Global variable is not a pointer!");

    unsigned AddressSpace = PT->getAddressSpace();

    // Ignore fancy-aligned globals for now.
    unsigned Alignment = DL.getPreferredAlignment(&GV);
    Type *Ty = GV.getValueType();
    if (Alignment > DL.getABITypeAlignment(Ty))
      continue;

    // Ignore all 'special' globals.
    if (GV.getName().startswith("llvm.") ||
        GV.getName().startswith(".llvm."))
      continue;

    // Ignore all "required" globals:
    if (isMustKeepGlobalVariable(&GV))
      continue;

    // Only globals strictly smaller than MaxOffset are candidates; doMerge's
    // packing loop depends on this bound for termination.
    if (DL.getTypeAllocSize(Ty) < MaxOffset) {
      if (TargetLoweringObjectFile::getKindForGlobal(&GV, *TM).isBSSLocal())
        BSSGlobals[AddressSpace].push_back(&GV);
      else if (GV.isConstant())
        ConstGlobals[AddressSpace].push_back(&GV);
      else
        Globals[AddressSpace].push_back(&GV);
    }
  }

  for (auto &P : Globals)
    if (P.second.size() > 1)
      Changed |= doMerge(P.second, M, false, P.first);

  for (auto &P : BSSGlobals)
    if (P.second.size() > 1)
      Changed |= doMerge(P.second, M, false, P.first);

  if (EnableGlobalMergeOnConst)
    for (auto &P : ConstGlobals)
      if (P.second.size() > 1)
        Changed |= doMerge(P.second, M, true, P.first);

  return Changed;
}

// All rewriting happens at module scope (doInitialization); nothing to do
// per function.
bool GlobalMerge::runOnFunction(Function &F) {
  return false;
}

bool GlobalMerge::doFinalization(Module &M) {
  MustKeepGlobalVariables.clear();
  return false;
}

Pass *llvm::createGlobalMergePass(const TargetMachine *TM, unsigned Offset,
                                  bool OnlyOptimizeForSize,
                                  bool MergeExternalByDefault) {
  // The command-line tri-state overrides the target's default when it was
  // explicitly set to true or false.
  bool MergeExternal = (EnableGlobalMergeOnExternal == cl::BOU_UNSET) ?
    MergeExternalByDefault : (EnableGlobalMergeOnExternal == cl::BOU_TRUE);
  return new GlobalMerge(TM, Offset, OnlyOptimizeForSize, MergeExternal);
}