BlockFrequencyInfoImpl.cpp revision dce4a407a24b04eebc6a376f8e62b41aaa7b071f
1//===- BlockFrequencyImplInfo.cpp - Block Frequency Info Implementation ---===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// Loops should be simplified before this analysis. 11// 12//===----------------------------------------------------------------------===// 13 14#include "llvm/Analysis/BlockFrequencyInfoImpl.h" 15#include "llvm/ADT/APFloat.h" 16#include "llvm/ADT/SCCIterator.h" 17#include "llvm/Support/raw_ostream.h" 18#include <deque> 19 20using namespace llvm; 21using namespace llvm::bfi_detail; 22 23#define DEBUG_TYPE "block-freq" 24 25//===----------------------------------------------------------------------===// 26// 27// UnsignedFloat implementation. 28// 29//===----------------------------------------------------------------------===// 30#ifndef _MSC_VER 31const int32_t UnsignedFloatBase::MaxExponent; 32const int32_t UnsignedFloatBase::MinExponent; 33#endif 34 35static void appendDigit(std::string &Str, unsigned D) { 36 assert(D < 10); 37 Str += '0' + D % 10; 38} 39 40static void appendNumber(std::string &Str, uint64_t N) { 41 while (N) { 42 appendDigit(Str, N % 10); 43 N /= 10; 44 } 45} 46 47static bool doesRoundUp(char Digit) { 48 switch (Digit) { 49 case '5': 50 case '6': 51 case '7': 52 case '8': 53 case '9': 54 return true; 55 default: 56 return false; 57 } 58} 59 60static std::string toStringAPFloat(uint64_t D, int E, unsigned Precision) { 61 assert(E >= UnsignedFloatBase::MinExponent); 62 assert(E <= UnsignedFloatBase::MaxExponent); 63 64 // Find a new E, but don't let it increase past MaxExponent. 65 int LeadingZeros = UnsignedFloatBase::countLeadingZeros64(D); 66 int NewE = std::min(UnsignedFloatBase::MaxExponent, E + 63 - LeadingZeros); 67 int Shift = 63 - (NewE - E); 68 assert(Shift <= LeadingZeros); 69 assert(Shift == LeadingZeros || NewE == UnsignedFloatBase::MaxExponent); 70 D <<= Shift; 71 E = NewE; 72 73 // Check for a denormal. 74 unsigned AdjustedE = E + 16383; 75 if (!(D >> 63)) { 76 assert(E == UnsignedFloatBase::MaxExponent); 77 AdjustedE = 0; 78 } 79 80 // Build the float and print it. 81 uint64_t RawBits[2] = {D, AdjustedE}; 82 APFloat Float(APFloat::x87DoubleExtended, APInt(80, RawBits)); 83 SmallVector<char, 24> Chars; 84 Float.toString(Chars, Precision, 0); 85 return std::string(Chars.begin(), Chars.end()); 86} 87 88static std::string stripTrailingZeros(const std::string &Float) { 89 size_t NonZero = Float.find_last_not_of('0'); 90 assert(NonZero != std::string::npos && "no . in floating point string"); 91 92 if (Float[NonZero] == '.') 93 ++NonZero; 94 95 return Float.substr(0, NonZero + 1); 96} 97 98std::string UnsignedFloatBase::toString(uint64_t D, int16_t E, int Width, 99 unsigned Precision) { 100 if (!D) 101 return "0.0"; 102 103 // Canonicalize exponent and digits. 104 uint64_t Above0 = 0; 105 uint64_t Below0 = 0; 106 uint64_t Extra = 0; 107 int ExtraShift = 0; 108 if (E == 0) { 109 Above0 = D; 110 } else if (E > 0) { 111 if (int Shift = std::min(int16_t(countLeadingZeros64(D)), E)) { 112 D <<= Shift; 113 E -= Shift; 114 115 if (!E) 116 Above0 = D; 117 } 118 } else if (E > -64) { 119 Above0 = D >> -E; 120 Below0 = D << (64 + E); 121 } else if (E > -120) { 122 Below0 = D >> (-E - 64); 123 Extra = D << (128 + E); 124 ExtraShift = -64 - E; 125 } 126 127 // Fall back on APFloat for very small and very large numbers. 128 if (!Above0 && !Below0) 129 return toStringAPFloat(D, E, Precision); 130 131 // Append the digits before the decimal. 132 std::string Str; 133 size_t DigitsOut = 0; 134 if (Above0) { 135 appendNumber(Str, Above0); 136 DigitsOut = Str.size(); 137 } else 138 appendDigit(Str, 0); 139 std::reverse(Str.begin(), Str.end()); 140 141 // Return early if there's nothing after the decimal. 142 if (!Below0) 143 return Str + ".0"; 144 145 // Append the decimal and beyond. 146 Str += '.'; 147 uint64_t Error = UINT64_C(1) << (64 - Width); 148 149 // We need to shift Below0 to the right to make space for calculating 150 // digits. Save the precision we're losing in Extra. 151 Extra = (Below0 & 0xf) << 56 | (Extra >> 8); 152 Below0 >>= 4; 153 size_t SinceDot = 0; 154 size_t AfterDot = Str.size(); 155 do { 156 if (ExtraShift) { 157 --ExtraShift; 158 Error *= 5; 159 } else 160 Error *= 10; 161 162 Below0 *= 10; 163 Extra *= 10; 164 Below0 += (Extra >> 60); 165 Extra = Extra & (UINT64_MAX >> 4); 166 appendDigit(Str, Below0 >> 60); 167 Below0 = Below0 & (UINT64_MAX >> 4); 168 if (DigitsOut || Str.back() != '0') 169 ++DigitsOut; 170 ++SinceDot; 171 } while (Error && (Below0 << 4 | Extra >> 60) >= Error / 2 && 172 (!Precision || DigitsOut <= Precision || SinceDot < 2)); 173 174 // Return early for maximum precision. 175 if (!Precision || DigitsOut <= Precision) 176 return stripTrailingZeros(Str); 177 178 // Find where to truncate. 179 size_t Truncate = 180 std::max(Str.size() - (DigitsOut - Precision), AfterDot + 1); 181 182 // Check if there's anything to truncate. 183 if (Truncate >= Str.size()) 184 return stripTrailingZeros(Str); 185 186 bool Carry = doesRoundUp(Str[Truncate]); 187 if (!Carry) 188 return stripTrailingZeros(Str.substr(0, Truncate)); 189 190 // Round with the first truncated digit. 191 for (std::string::reverse_iterator I(Str.begin() + Truncate), E = Str.rend(); 192 I != E; ++I) { 193 if (*I == '.') 194 continue; 195 if (*I == '9') { 196 *I = '0'; 197 continue; 198 } 199 200 ++*I; 201 Carry = false; 202 break; 203 } 204 205 // Add "1" in front if we still need to carry. 206 return stripTrailingZeros(std::string(Carry, '1') + Str.substr(0, Truncate)); 207} 208 209raw_ostream &UnsignedFloatBase::print(raw_ostream &OS, uint64_t D, int16_t E, 210 int Width, unsigned Precision) { 211 return OS << toString(D, E, Width, Precision); 212} 213 214void UnsignedFloatBase::dump(uint64_t D, int16_t E, int Width) { 215 print(dbgs(), D, E, Width, 0) << "[" << Width << ":" << D << "*2^" << E 216 << "]"; 217} 218 219static std::pair<uint64_t, int16_t> 220getRoundedFloat(uint64_t N, bool ShouldRound, int64_t Shift) { 221 if (ShouldRound) 222 if (!++N) 223 // Rounding caused an overflow. 224 return std::make_pair(UINT64_C(1), Shift + 64); 225 return std::make_pair(N, Shift); 226} 227 228std::pair<uint64_t, int16_t> UnsignedFloatBase::divide64(uint64_t Dividend, 229 uint64_t Divisor) { 230 // Input should be sanitized. 231 assert(Divisor); 232 assert(Dividend); 233 234 // Minimize size of divisor. 235 int16_t Shift = 0; 236 if (int Zeros = countTrailingZeros(Divisor)) { 237 Shift -= Zeros; 238 Divisor >>= Zeros; 239 } 240 241 // Check for powers of two. 242 if (Divisor == 1) 243 return std::make_pair(Dividend, Shift); 244 245 // Maximize size of dividend. 246 if (int Zeros = countLeadingZeros64(Dividend)) { 247 Shift -= Zeros; 248 Dividend <<= Zeros; 249 } 250 251 // Start with the result of a divide. 252 uint64_t Quotient = Dividend / Divisor; 253 Dividend %= Divisor; 254 255 // Continue building the quotient with long division. 256 // 257 // TODO: continue with largers digits. 258 while (!(Quotient >> 63) && Dividend) { 259 // Shift Dividend, and check for overflow. 260 bool IsOverflow = Dividend >> 63; 261 Dividend <<= 1; 262 --Shift; 263 264 // Divide. 265 bool DoesDivide = IsOverflow || Divisor <= Dividend; 266 Quotient = (Quotient << 1) | uint64_t(DoesDivide); 267 Dividend -= DoesDivide ? Divisor : 0; 268 } 269 270 // Round. 271 if (Dividend >= getHalf(Divisor)) 272 if (!++Quotient) 273 // Rounding caused an overflow in Quotient. 274 return std::make_pair(UINT64_C(1), Shift + 64); 275 276 return getRoundedFloat(Quotient, Dividend >= getHalf(Divisor), Shift); 277} 278 279std::pair<uint64_t, int16_t> UnsignedFloatBase::multiply64(uint64_t L, 280 uint64_t R) { 281 // Separate into two 32-bit digits (U.L). 282 uint64_t UL = L >> 32, LL = L & UINT32_MAX, UR = R >> 32, LR = R & UINT32_MAX; 283 284 // Compute cross products. 285 uint64_t P1 = UL * UR, P2 = UL * LR, P3 = LL * UR, P4 = LL * LR; 286 287 // Sum into two 64-bit digits. 288 uint64_t Upper = P1, Lower = P4; 289 auto addWithCarry = [&](uint64_t N) { 290 uint64_t NewLower = Lower + (N << 32); 291 Upper += (N >> 32) + (NewLower < Lower); 292 Lower = NewLower; 293 }; 294 addWithCarry(P2); 295 addWithCarry(P3); 296 297 // Check whether the upper digit is empty. 298 if (!Upper) 299 return std::make_pair(Lower, 0); 300 301 // Shift as little as possible to maximize precision. 302 unsigned LeadingZeros = countLeadingZeros64(Upper); 303 int16_t Shift = 64 - LeadingZeros; 304 if (LeadingZeros) 305 Upper = Upper << LeadingZeros | Lower >> Shift; 306 bool ShouldRound = Shift && (Lower & UINT64_C(1) << (Shift - 1)); 307 return getRoundedFloat(Upper, ShouldRound, Shift); 308} 309 310//===----------------------------------------------------------------------===// 311// 312// BlockMass implementation. 313// 314//===----------------------------------------------------------------------===// 315UnsignedFloat<uint64_t> BlockMass::toFloat() const { 316 if (isFull()) 317 return UnsignedFloat<uint64_t>(1, 0); 318 return UnsignedFloat<uint64_t>(getMass() + 1, -64); 319} 320 321void BlockMass::dump() const { print(dbgs()); } 322 323static char getHexDigit(int N) { 324 assert(N < 16); 325 if (N < 10) 326 return '0' + N; 327 return 'a' + N - 10; 328} 329raw_ostream &BlockMass::print(raw_ostream &OS) const { 330 for (int Digits = 0; Digits < 16; ++Digits) 331 OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf); 332 return OS; 333} 334 335//===----------------------------------------------------------------------===// 336// 337// BlockFrequencyInfoImpl implementation. 338// 339//===----------------------------------------------------------------------===// 340namespace { 341 342typedef BlockFrequencyInfoImplBase::BlockNode BlockNode; 343typedef BlockFrequencyInfoImplBase::Distribution Distribution; 344typedef BlockFrequencyInfoImplBase::Distribution::WeightList WeightList; 345typedef BlockFrequencyInfoImplBase::Float Float; 346typedef BlockFrequencyInfoImplBase::LoopData LoopData; 347typedef BlockFrequencyInfoImplBase::Weight Weight; 348typedef BlockFrequencyInfoImplBase::FrequencyData FrequencyData; 349 350/// \brief Dithering mass distributer. 351/// 352/// This class splits up a single mass into portions by weight, dithering to 353/// spread out error. No mass is lost. The dithering precision depends on the 354/// precision of the product of \a BlockMass and \a BranchProbability. 355/// 356/// The distribution algorithm follows. 357/// 358/// 1. Initialize by saving the sum of the weights in \a RemWeight and the 359/// mass to distribute in \a RemMass. 360/// 361/// 2. For each portion: 362/// 363/// 1. Construct a branch probability, P, as the portion's weight divided 364/// by the current value of \a RemWeight. 365/// 2. Calculate the portion's mass as \a RemMass times P. 366/// 3. Update \a RemWeight and \a RemMass at each portion by subtracting 367/// the current portion's weight and mass. 368struct DitheringDistributer { 369 uint32_t RemWeight; 370 BlockMass RemMass; 371 372 DitheringDistributer(Distribution &Dist, const BlockMass &Mass); 373 374 BlockMass takeMass(uint32_t Weight); 375}; 376} 377 378DitheringDistributer::DitheringDistributer(Distribution &Dist, 379 const BlockMass &Mass) { 380 Dist.normalize(); 381 RemWeight = Dist.Total; 382 RemMass = Mass; 383} 384 385BlockMass DitheringDistributer::takeMass(uint32_t Weight) { 386 assert(Weight && "invalid weight"); 387 assert(Weight <= RemWeight); 388 BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight); 389 390 // Decrement totals (dither). 391 RemWeight -= Weight; 392 RemMass -= Mass; 393 return Mass; 394} 395 396void Distribution::add(const BlockNode &Node, uint64_t Amount, 397 Weight::DistType Type) { 398 assert(Amount && "invalid weight of 0"); 399 uint64_t NewTotal = Total + Amount; 400 401 // Check for overflow. It should be impossible to overflow twice. 402 bool IsOverflow = NewTotal < Total; 403 assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow"); 404 DidOverflow |= IsOverflow; 405 406 // Update the total. 407 Total = NewTotal; 408 409 // Save the weight. 410 Weight W; 411 W.TargetNode = Node; 412 W.Amount = Amount; 413 W.Type = Type; 414 Weights.push_back(W); 415} 416 417static void combineWeight(Weight &W, const Weight &OtherW) { 418 assert(OtherW.TargetNode.isValid()); 419 if (!W.Amount) { 420 W = OtherW; 421 return; 422 } 423 assert(W.Type == OtherW.Type); 424 assert(W.TargetNode == OtherW.TargetNode); 425 assert(W.Amount < W.Amount + OtherW.Amount && "Unexpected overflow"); 426 W.Amount += OtherW.Amount; 427} 428static void combineWeightsBySorting(WeightList &Weights) { 429 // Sort so edges to the same node are adjacent. 430 std::sort(Weights.begin(), Weights.end(), 431 [](const Weight &L, 432 const Weight &R) { return L.TargetNode < R.TargetNode; }); 433 434 // Combine adjacent edges. 435 WeightList::iterator O = Weights.begin(); 436 for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E; 437 ++O, (I = L)) { 438 *O = *I; 439 440 // Find the adjacent weights to the same node. 441 for (++L; L != E && I->TargetNode == L->TargetNode; ++L) 442 combineWeight(*O, *L); 443 } 444 445 // Erase extra entries. 446 Weights.erase(O, Weights.end()); 447 return; 448} 449static void combineWeightsByHashing(WeightList &Weights) { 450 // Collect weights into a DenseMap. 451 typedef DenseMap<BlockNode::IndexType, Weight> HashTable; 452 HashTable Combined(NextPowerOf2(2 * Weights.size())); 453 for (const Weight &W : Weights) 454 combineWeight(Combined[W.TargetNode.Index], W); 455 456 // Check whether anything changed. 457 if (Weights.size() == Combined.size()) 458 return; 459 460 // Fill in the new weights. 461 Weights.clear(); 462 Weights.reserve(Combined.size()); 463 for (const auto &I : Combined) 464 Weights.push_back(I.second); 465} 466static void combineWeights(WeightList &Weights) { 467 // Use a hash table for many successors to keep this linear. 468 if (Weights.size() > 128) { 469 combineWeightsByHashing(Weights); 470 return; 471 } 472 473 combineWeightsBySorting(Weights); 474} 475static uint64_t shiftRightAndRound(uint64_t N, int Shift) { 476 assert(Shift >= 0); 477 assert(Shift < 64); 478 if (!Shift) 479 return N; 480 return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1)); 481} 482void Distribution::normalize() { 483 // Early exit for termination nodes. 484 if (Weights.empty()) 485 return; 486 487 // Only bother if there are multiple successors. 488 if (Weights.size() > 1) 489 combineWeights(Weights); 490 491 // Early exit when combined into a single successor. 492 if (Weights.size() == 1) { 493 Total = 1; 494 Weights.front().Amount = 1; 495 return; 496 } 497 498 // Determine how much to shift right so that the total fits into 32-bits. 499 // 500 // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1 501 // for each weight can cause a 32-bit overflow. 502 int Shift = 0; 503 if (DidOverflow) 504 Shift = 33; 505 else if (Total > UINT32_MAX) 506 Shift = 33 - countLeadingZeros(Total); 507 508 // Early exit if nothing needs to be scaled. 509 if (!Shift) 510 return; 511 512 // Recompute the total through accumulation (rather than shifting it) so that 513 // it's accurate after shifting. 514 Total = 0; 515 516 // Sum the weights to each node and shift right if necessary. 517 for (Weight &W : Weights) { 518 // Scale down below UINT32_MAX. Since Shift is larger than necessary, we 519 // can round here without concern about overflow. 520 assert(W.TargetNode.isValid()); 521 W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift)); 522 assert(W.Amount <= UINT32_MAX); 523 524 // Update the total. 525 Total += W.Amount; 526 } 527 assert(Total <= UINT32_MAX); 528} 529 530void BlockFrequencyInfoImplBase::clear() { 531 // Swap with a default-constructed std::vector, since std::vector<>::clear() 532 // does not actually clear heap storage. 533 std::vector<FrequencyData>().swap(Freqs); 534 std::vector<WorkingData>().swap(Working); 535 Loops.clear(); 536} 537 538/// \brief Clear all memory not needed downstream. 539/// 540/// Releases all memory not used downstream. In particular, saves Freqs. 541static void cleanup(BlockFrequencyInfoImplBase &BFI) { 542 std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs)); 543 BFI.clear(); 544 BFI.Freqs = std::move(SavedFreqs); 545} 546 547bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, 548 const LoopData *OuterLoop, 549 const BlockNode &Pred, 550 const BlockNode &Succ, 551 uint64_t Weight) { 552 if (!Weight) 553 Weight = 1; 554 555 auto isLoopHeader = [&OuterLoop](const BlockNode &Node) { 556 return OuterLoop && OuterLoop->isHeader(Node); 557 }; 558 559 BlockNode Resolved = Working[Succ.Index].getResolvedNode(); 560 561#ifndef NDEBUG 562 auto debugSuccessor = [&](const char *Type) { 563 dbgs() << " =>" 564 << " [" << Type << "] weight = " << Weight; 565 if (!isLoopHeader(Resolved)) 566 dbgs() << ", succ = " << getBlockName(Succ); 567 if (Resolved != Succ) 568 dbgs() << ", resolved = " << getBlockName(Resolved); 569 dbgs() << "\n"; 570 }; 571 (void)debugSuccessor; 572#endif 573 574 if (isLoopHeader(Resolved)) { 575 DEBUG(debugSuccessor("backedge")); 576 Dist.addBackedge(OuterLoop->getHeader(), Weight); 577 return true; 578 } 579 580 if (Working[Resolved.Index].getContainingLoop() != OuterLoop) { 581 DEBUG(debugSuccessor(" exit ")); 582 Dist.addExit(Resolved, Weight); 583 return true; 584 } 585 586 if (Resolved < Pred) { 587 if (!isLoopHeader(Pred)) { 588 // If OuterLoop is an irreducible loop, we can't actually handle this. 589 assert((!OuterLoop || !OuterLoop->isIrreducible()) && 590 "unhandled irreducible control flow"); 591 592 // Irreducible backedge. Abort. 593 DEBUG(debugSuccessor("abort!!!")); 594 return false; 595 } 596 597 // If "Pred" is a loop header, then this isn't really a backedge; rather, 598 // OuterLoop must be irreducible. These false backedges can come only from 599 // secondary loop headers. 600 assert(OuterLoop && OuterLoop->isIrreducible() && !isLoopHeader(Resolved) && 601 "unhandled irreducible control flow"); 602 } 603 604 DEBUG(debugSuccessor(" local ")); 605 Dist.addLocal(Resolved, Weight); 606 return true; 607} 608 609bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist( 610 const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist) { 611 // Copy the exit map into Dist. 612 for (const auto &I : Loop.Exits) 613 if (!addToDist(Dist, OuterLoop, Loop.getHeader(), I.first, 614 I.second.getMass())) 615 // Irreducible backedge. 616 return false; 617 618 return true; 619} 620 621/// \brief Get the maximum allowed loop scale. 622/// 623/// Gives the maximum number of estimated iterations allowed for a loop. Very 624/// large numbers cause problems downstream (even within 64-bits). 625static Float getMaxLoopScale() { return Float(1, 12); } 626 627/// \brief Compute the loop scale for a loop. 628void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) { 629 // Compute loop scale. 630 DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n"); 631 632 // LoopScale == 1 / ExitMass 633 // ExitMass == HeadMass - BackedgeMass 634 BlockMass ExitMass = BlockMass::getFull() - Loop.BackedgeMass; 635 636 // Block scale stores the inverse of the scale. 637 Loop.Scale = ExitMass.toFloat().inverse(); 638 639 DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull() 640 << " - " << Loop.BackedgeMass << ")\n" 641 << " - scale = " << Loop.Scale << "\n"); 642 643 if (Loop.Scale > getMaxLoopScale()) { 644 Loop.Scale = getMaxLoopScale(); 645 DEBUG(dbgs() << " - reduced-to-max-scale: " << getMaxLoopScale() << "\n"); 646 } 647} 648 649/// \brief Package up a loop. 650void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) { 651 DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n"); 652 653 // Clear the subloop exits to prevent quadratic memory usage. 654 for (const BlockNode &M : Loop.Nodes) { 655 if (auto *Loop = Working[M.Index].getPackagedLoop()) 656 Loop->Exits.clear(); 657 DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n"); 658 } 659 Loop.IsPackaged = true; 660} 661 662void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, 663 LoopData *OuterLoop, 664 Distribution &Dist) { 665 BlockMass Mass = Working[Source.Index].getMass(); 666 DEBUG(dbgs() << " => mass: " << Mass << "\n"); 667 668 // Distribute mass to successors as laid out in Dist. 669 DitheringDistributer D(Dist, Mass); 670 671#ifndef NDEBUG 672 auto debugAssign = [&](const BlockNode &T, const BlockMass &M, 673 const char *Desc) { 674 dbgs() << " => assign " << M << " (" << D.RemMass << ")"; 675 if (Desc) 676 dbgs() << " [" << Desc << "]"; 677 if (T.isValid()) 678 dbgs() << " to " << getBlockName(T); 679 dbgs() << "\n"; 680 }; 681 (void)debugAssign; 682#endif 683 684 for (const Weight &W : Dist.Weights) { 685 // Check for a local edge (non-backedge and non-exit). 686 BlockMass Taken = D.takeMass(W.Amount); 687 if (W.Type == Weight::Local) { 688 Working[W.TargetNode.Index].getMass() += Taken; 689 DEBUG(debugAssign(W.TargetNode, Taken, nullptr)); 690 continue; 691 } 692 693 // Backedges and exits only make sense if we're processing a loop. 694 assert(OuterLoop && "backedge or exit outside of loop"); 695 696 // Check for a backedge. 697 if (W.Type == Weight::Backedge) { 698 OuterLoop->BackedgeMass += Taken; 699 DEBUG(debugAssign(BlockNode(), Taken, "back")); 700 continue; 701 } 702 703 // This must be an exit. 704 assert(W.Type == Weight::Exit); 705 OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken)); 706 DEBUG(debugAssign(W.TargetNode, Taken, "exit")); 707 } 708} 709 710static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI, 711 const Float &Min, const Float &Max) { 712 // Scale the Factor to a size that creates integers. Ideally, integers would 713 // be scaled so that Max == UINT64_MAX so that they can be best 714 // differentiated. However, the register allocator currently deals poorly 715 // with large numbers. Instead, push Min up a little from 1 to give some 716 // room to differentiate small, unequal numbers. 717 // 718 // TODO: fix issues downstream so that ScalingFactor can be Float(1,64)/Max. 719 Float ScalingFactor = Min.inverse(); 720 if ((Max / Min).lg() < 60) 721 ScalingFactor <<= 3; 722 723 // Translate the floats to integers. 724 DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max 725 << ", factor = " << ScalingFactor << "\n"); 726 for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) { 727 Float Scaled = BFI.Freqs[Index].Floating * ScalingFactor; 728 BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>()); 729 DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = " 730 << BFI.Freqs[Index].Floating << ", scaled = " << Scaled 731 << ", int = " << BFI.Freqs[Index].Integer << "\n"); 732 } 733} 734 735/// \brief Unwrap a loop package. 736/// 737/// Visits all the members of a loop, adjusting their BlockData according to 738/// the loop's pseudo-node. 739static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) { 740 DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop) 741 << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale 742 << "\n"); 743 Loop.Scale *= Loop.Mass.toFloat(); 744 Loop.IsPackaged = false; 745 DEBUG(dbgs() << " => combined-scale = " << Loop.Scale << "\n"); 746 747 // Propagate the head scale through the loop. Since members are visited in 748 // RPO, the head scale will be updated by the loop scale first, and then the 749 // final head scale will be used for updated the rest of the members. 750 for (const BlockNode &N : Loop.Nodes) { 751 const auto &Working = BFI.Working[N.Index]; 752 Float &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale 753 : BFI.Freqs[N.Index].Floating; 754 Float New = Loop.Scale * F; 755 DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " << New 756 << "\n"); 757 F = New; 758 } 759} 760 761void BlockFrequencyInfoImplBase::unwrapLoops() { 762 // Set initial frequencies from loop-local masses. 763 for (size_t Index = 0; Index < Working.size(); ++Index) 764 Freqs[Index].Floating = Working[Index].Mass.toFloat(); 765 766 for (LoopData &Loop : Loops) 767 unwrapLoop(*this, Loop); 768} 769 770void BlockFrequencyInfoImplBase::finalizeMetrics() { 771 // Unwrap loop packages in reverse post-order, tracking min and max 772 // frequencies. 773 auto Min = Float::getLargest(); 774 auto Max = Float::getZero(); 775 for (size_t Index = 0; Index < Working.size(); ++Index) { 776 // Update min/max scale. 777 Min = std::min(Min, Freqs[Index].Floating); 778 Max = std::max(Max, Freqs[Index].Floating); 779 } 780 781 // Convert to integers. 782 convertFloatingToInteger(*this, Min, Max); 783 784 // Clean up data structures. 785 cleanup(*this); 786 787 // Print out the final stats. 788 DEBUG(dump()); 789} 790 791BlockFrequency 792BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const { 793 if (!Node.isValid()) 794 return 0; 795 return Freqs[Node.Index].Integer; 796} 797Float 798BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { 799 if (!Node.isValid()) 800 return Float::getZero(); 801 return Freqs[Node.Index].Floating; 802} 803 804std::string 805BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const { 806 return std::string(); 807} 808std::string 809BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const { 810 return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? "**" : "*"); 811} 812 813raw_ostream & 814BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, 815 const BlockNode &Node) const { 816 return OS << getFloatingBlockFreq(Node); 817} 818 819raw_ostream & 820BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, 821 const BlockFrequency &Freq) const { 822 Float Block(Freq.getFrequency(), 0); 823 Float Entry(getEntryFreq(), 0); 824 825 return OS << Block / Entry; 826} 827 828void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) { 829 Start = OuterLoop.getHeader(); 830 Nodes.reserve(OuterLoop.Nodes.size()); 831 for (auto N : OuterLoop.Nodes) 832 addNode(N); 833 indexNodes(); 834} 835void IrreducibleGraph::addNodesInFunction() { 836 Start = 0; 837 for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index) 838 if (!BFI.Working[Index].isPackaged()) 839 addNode(Index); 840 indexNodes(); 841} 842void IrreducibleGraph::indexNodes() { 843 for (auto &I : Nodes) 844 Lookup[I.Node.Index] = &I; 845} 846void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ, 847 const BFIBase::LoopData *OuterLoop) { 848 if (OuterLoop && OuterLoop->isHeader(Succ)) 849 return; 850 auto L = Lookup.find(Succ.Index); 851 if (L == Lookup.end()) 852 return; 853 IrrNode &SuccIrr = *L->second; 854 Irr.Edges.push_back(&SuccIrr); 855 SuccIrr.Edges.push_front(&Irr); 856 ++SuccIrr.NumIn; 857} 858 859namespace llvm { 860template <> struct GraphTraits<IrreducibleGraph> { 861 typedef bfi_detail::IrreducibleGraph GraphT; 862 863 typedef const GraphT::IrrNode NodeType; 864 typedef GraphT::IrrNode::iterator ChildIteratorType; 865 866 static const NodeType *getEntryNode(const GraphT &G) { 867 return G.StartIrr; 868 } 869 static ChildIteratorType child_begin(NodeType *N) { return N->succ_begin(); } 870 static ChildIteratorType child_end(NodeType *N) { return N->succ_end(); } 871}; 872} 873 874/// \brief Find extra irreducible headers. 875/// 876/// Find entry blocks and other blocks with backedges, which exist when \c G 877/// contains irreducible sub-SCCs. 878static void findIrreducibleHeaders( 879 const BlockFrequencyInfoImplBase &BFI, 880 const IrreducibleGraph &G, 881 const std::vector<const IrreducibleGraph::IrrNode *> &SCC, 882 LoopData::NodeList &Headers, LoopData::NodeList &Others) { 883 // Map from nodes in the SCC to whether it's an entry block. 884 SmallDenseMap<const IrreducibleGraph::IrrNode *, bool, 8> InSCC; 885 886 // InSCC also acts the set of nodes in the graph. Seed it. 887 for (const auto *I : SCC) 888 InSCC[I] = false; 889 890 for (auto I = InSCC.begin(), E = InSCC.end(); I != E; ++I) { 891 auto &Irr = *I->first; 892 for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { 893 if (InSCC.count(P)) 894 continue; 895 896 // This is an entry block. 897 I->second = true; 898 Headers.push_back(Irr.Node); 899 DEBUG(dbgs() << " => entry = " << BFI.getBlockName(Irr.Node) << "\n"); 900 break; 901 } 902 } 903 assert(Headers.size() >= 2 && "Should be irreducible"); 904 if (Headers.size() == InSCC.size()) { 905 // Every block is a header. 906 std::sort(Headers.begin(), Headers.end()); 907 return; 908 } 909 910 // Look for extra headers from irreducible sub-SCCs. 911 for (const auto &I : InSCC) { 912 // Entry blocks are already headers. 913 if (I.second) 914 continue; 915 916 auto &Irr = *I.first; 917 for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { 918 // Skip forward edges. 919 if (P->Node < Irr.Node) 920 continue; 921 922 // Skip predecessors from entry blocks. These can have inverted 923 // ordering. 924 if (InSCC.lookup(P)) 925 continue; 926 927 // Store the extra header. 928 Headers.push_back(Irr.Node); 929 DEBUG(dbgs() << " => extra = " << BFI.getBlockName(Irr.Node) << "\n"); 930 break; 931 } 932 if (Headers.back() == Irr.Node) 933 // Added this as a header. 934 continue; 935 936 // This is not a header. 937 Others.push_back(Irr.Node); 938 DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n"); 939 } 940 std::sort(Headers.begin(), Headers.end()); 941 std::sort(Others.begin(), Others.end()); 942} 943 944static void createIrreducibleLoop( 945 BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, 946 LoopData *OuterLoop, std::list<LoopData>::iterator Insert, 947 const std::vector<const IrreducibleGraph::IrrNode *> &SCC) { 948 // Translate the SCC into RPO. 949 DEBUG(dbgs() << " - found-scc\n"); 950 951 LoopData::NodeList Headers; 952 LoopData::NodeList Others; 953 findIrreducibleHeaders(BFI, G, SCC, Headers, Others); 954 955 auto Loop = BFI.Loops.emplace(Insert, OuterLoop, Headers.begin(), 956 Headers.end(), Others.begin(), Others.end()); 957 958 // Update loop hierarchy. 959 for (const auto &N : Loop->Nodes) 960 if (BFI.Working[N.Index].isLoopHeader()) 961 BFI.Working[N.Index].Loop->Parent = &*Loop; 962 else 963 BFI.Working[N.Index].Loop = &*Loop; 964} 965 966iterator_range<std::list<LoopData>::iterator> 967BlockFrequencyInfoImplBase::analyzeIrreducible( 968 const IrreducibleGraph &G, LoopData *OuterLoop, 969 std::list<LoopData>::iterator Insert) { 970 assert((OuterLoop == nullptr) == (Insert == Loops.begin())); 971 auto Prev = OuterLoop ? std::prev(Insert) : Loops.end(); 972 973 for (auto I = scc_begin(G); !I.isAtEnd(); ++I) { 974 if (I->size() < 2) 975 continue; 976 977 // Translate the SCC into RPO. 978 createIrreducibleLoop(*this, G, OuterLoop, Insert, *I); 979 } 980 981 if (OuterLoop) 982 return make_range(std::next(Prev), Insert); 983 return make_range(Loops.begin(), Insert); 984} 985 986void 987BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) { 988 OuterLoop.Exits.clear(); 989 OuterLoop.BackedgeMass = BlockMass::getEmpty(); 990 auto O = OuterLoop.Nodes.begin() + 1; 991 for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I) 992 if (!Working[I->Index].isPackaged()) 993 *O++ = *I; 994 OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end()); 995} 996