// BlockFrequencyInfoImpl.h revision dce4a407a24b04eebc6a376f8e62b41aaa7b071f
//==- BlockFrequencyInfoImpl.h - Block Frequency Implementation -*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Shared implementation of BlockFrequency for IR and Machine Instructions.
// See the documentation below for BlockFrequencyInfoImpl for details.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
#define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <deque>
#include <list>
#include <string>
#include <vector>

#define DEBUG_TYPE "block-freq"

//===----------------------------------------------------------------------===//
//
// UnsignedFloat definition.
//
// TODO: Make this private to BlockFrequencyInfoImpl or delete.
//
//===----------------------------------------------------------------------===//
namespace llvm {

/// \brief Non-template helpers shared by all UnsignedFloat instantiations.
///
/// Collects the width-independent constants and the 64-bit arithmetic
/// primitives (divide64, multiply64, printing) so they are compiled once
/// rather than per digits type.
class UnsignedFloatBase {
public:
  // Exponent bounds chosen to match x87 long double (see UnsignedFloat docs).
  static const int32_t MaxExponent = 16383;
  static const int32_t MinExponent = -16382;
  static const int DefaultPrecision = 10;

  static void dump(uint64_t D, int16_t E, int Width);
  static raw_ostream &print(raw_ostream &OS, uint64_t D, int16_t E, int Width,
                            unsigned Precision);
  static std::string toString(uint64_t D, int16_t E, int Width,
                              unsigned Precision);
  static int countLeadingZeros32(uint32_t N) { return countLeadingZeros(N); }
  static int countLeadingZeros64(uint64_t N) { return countLeadingZeros(N); }
  /// Half of \c N, rounded up.  Used as the round-to-nearest threshold.
  static uint64_t getHalf(uint64_t N) { return (N >> 1) + (N & 1); }

  /// Split a signed 64-bit value into its magnitude and a negative flag.
  /// INT64_MIN is handled explicitly since -INT64_MIN overflows.
  static std::pair<uint64_t, bool> splitSigned(int64_t N) {
    if (N >= 0)
      return std::make_pair(N, false);
    uint64_t Unsigned = N == INT64_MIN ? UINT64_C(1) << 63 : uint64_t(-N);
    return std::make_pair(Unsigned, true);
  }
  /// Recombine a magnitude and sign, saturating at INT64_MIN/INT64_MAX.
  static int64_t joinSigned(uint64_t U, bool IsNeg) {
    if (U > uint64_t(INT64_MAX))
      return IsNeg ? INT64_MIN : INT64_MAX;
    return IsNeg ? -int64_t(U) : int64_t(U);
  }

  // lgImpl() returns (rounded-lg, round-direction); these extract the three
  // rounding flavours from that pair.
  static int32_t extractLg(const std::pair<int32_t, int> &Lg) {
    return Lg.first;
  }
  static int32_t extractLgFloor(const std::pair<int32_t, int> &Lg) {
    return Lg.first - (Lg.second > 0);
  }
  static int32_t extractLgCeiling(const std::pair<int32_t, int> &Lg) {
    return Lg.first + (Lg.second < 0);
  }

  static std::pair<uint64_t, int16_t> divide64(uint64_t L, uint64_t R);
  static std::pair<uint64_t, int16_t> multiply64(uint64_t L, uint64_t R);

  /// Compare \c L (scaled down by \c Shift bits) against \c R, accounting for
  /// any non-zero bits shifted out of \c L (they make L strictly greater).
  static int compare(uint64_t L, uint64_t R, int Shift) {
    assert(Shift >= 0);
    assert(Shift < 64);

    uint64_t L_adjusted = L >> Shift;
    if (L_adjusted < R)
      return -1;
    if (L_adjusted > R)
      return 1;

    // Equal after shifting: L is larger iff bits were lost in the shift.
    return L > L_adjusted << Shift ? 1 : 0;
  }
};

/// \brief Simple representation of an unsigned floating point.
///
/// UnsignedFloat is a unsigned floating point number.  It uses simple
/// saturation arithmetic, and every operation is well-defined for every value.
///
/// The number is split into a signed exponent and unsigned digits.  The number
/// represented is \c getDigits()*2^getExponent().  In this way, the digits are
/// much like the mantissa in the x87 long double, but there is no canonical
/// form, so the same number can be represented by many bit representations
/// (it's always in "denormal" mode).
///
/// UnsignedFloat is templated on the underlying integer type for digits, which
/// is expected to be one of uint64_t, uint32_t, uint16_t or uint8_t.
///
/// Unlike builtin floating point types, UnsignedFloat is portable.
///
/// Unlike APFloat, UnsignedFloat does not model architecture floating point
/// behaviour (this should make it a little faster), and implements most
/// operators (this makes it usable).
///
/// UnsignedFloat is totally ordered.  However, there is no canonical form, so
/// there are multiple representations of most scalars.  E.g.:
///
///     UnsignedFloat(8u, 0) == UnsignedFloat(4u, 1)
///     UnsignedFloat(4u, 1) == UnsignedFloat(2u, 2)
///     UnsignedFloat(2u, 2) == UnsignedFloat(1u, 3)
///
/// UnsignedFloat implements most arithmetic operations.  Precision is kept
/// where possible.  Uses simple saturation arithmetic, so that operations
/// saturate to 0.0 or getLargest() rather than under or overflowing.  It has
/// some extra arithmetic for unit inversion.  0.0/0.0 is defined to be 0.0.
/// Any other division by 0.0 is defined to be getLargest().
///
/// As a convenience for modifying the exponent, left and right shifting are
/// both implemented, and both interpret negative shifts as positive shifts in
/// the opposite direction.
///
/// Exponents are limited to the range accepted by x87 long double.  This makes
/// it trivial to add functionality to convert to APFloat (this is already
/// relied on for the implementation of printing).
///
/// The current plan is to gut this and make the necessary parts of it (even
/// more) private to BlockFrequencyInfo.
template <class DigitsT> class UnsignedFloat : UnsignedFloatBase {
public:
  static_assert(!std::numeric_limits<DigitsT>::is_signed,
                "only unsigned floats supported");

  typedef DigitsT DigitsType;

private:
  typedef std::numeric_limits<DigitsType> DigitsLimits;

  static const int Width = sizeof(DigitsType) * 8;
  static_assert(Width <= 64, "invalid integer width for digits");

private:
  DigitsType Digits;
  int16_t Exponent;

public:
  UnsignedFloat() : Digits(0), Exponent(0) {}

  UnsignedFloat(DigitsType Digits, int16_t Exponent)
      : Digits(Digits), Exponent(Exponent) {}

private:
  UnsignedFloat(const std::pair<uint64_t, int16_t> &X)
      : Digits(X.first), Exponent(X.second) {}

public:
  static UnsignedFloat getZero() { return UnsignedFloat(0, 0); }
  static UnsignedFloat getOne() { return UnsignedFloat(1, 0); }
  static UnsignedFloat getLargest() {
    return UnsignedFloat(DigitsLimits::max(), MaxExponent);
  }
  static UnsignedFloat getFloat(uint64_t N) { return adjustToWidth(N, 0); }
  static UnsignedFloat getInverseFloat(uint64_t N) {
    return getFloat(N).invert();
  }
  static UnsignedFloat getFraction(DigitsType N, DigitsType D) {
    return getQuotient(N, D);
  }

  int16_t getExponent() const { return Exponent; }
  DigitsType getDigits() const { return Digits; }

  /// \brief Convert to the given integer type.
  ///
  /// Convert to \c IntT using simple saturating arithmetic, truncating if
  /// necessary.
  template <class IntT> IntT toInt() const;

  bool isZero() const { return !Digits; }
  bool isLargest() const { return *this == getLargest(); }
  bool isOne() const {
    // 1.0 requires a single set digit bit exactly cancelled by the exponent.
    if (Exponent > 0 || Exponent <= -Width)
      return false;
    return Digits == DigitsType(1) << -Exponent;
  }

  /// \brief The log base 2, rounded.
  ///
  /// Get the lg of the scalar.  lg 0 is defined to be INT32_MIN.
  int32_t lg() const { return extractLg(lgImpl()); }

  /// \brief The log base 2, rounded towards INT32_MIN.
  ///
  /// Get the lg floor.  lg 0 is defined to be INT32_MIN.
  int32_t lgFloor() const { return extractLgFloor(lgImpl()); }

  /// \brief The log base 2, rounded towards INT32_MAX.
  ///
  /// Get the lg ceiling.  lg 0 is defined to be INT32_MIN.
  int32_t lgCeiling() const { return extractLgCeiling(lgImpl()); }

  bool operator==(const UnsignedFloat &X) const { return compare(X) == 0; }
  bool operator<(const UnsignedFloat &X) const { return compare(X) < 0; }
  bool operator!=(const UnsignedFloat &X) const { return compare(X) != 0; }
  bool operator>(const UnsignedFloat &X) const { return compare(X) > 0; }
  bool operator<=(const UnsignedFloat &X) const { return compare(X) <= 0; }
  bool operator>=(const UnsignedFloat &X) const { return compare(X) >= 0; }

  bool operator!() const { return isZero(); }

  /// \brief Convert to a decimal representation in a string.
  ///
  /// Convert to a string.  Uses scientific notation for very large/small
  /// numbers.  Scientific notation is used roughly for numbers outside of the
  /// range 2^-64 through 2^64.
  ///
  /// \c Precision indicates the number of decimal digits of precision to use;
  /// 0 requests the maximum available.
  ///
  /// As a special case to make debugging easier, if the number is small enough
  /// to convert without scientific notation and has more than \c Precision
  /// digits before the decimal place, it's printed accurately to the first
  /// digit past zero.  E.g., assuming 10 digits of precision:
  ///
  ///     98765432198.7654... => 98765432198.8
  ///      8765432198.7654... =>  8765432198.8
  ///       765432198.7654... =>   765432198.8
  ///        65432198.7654... =>    65432198.77
  ///         5432198.7654... =>     5432198.765
  std::string toString(unsigned Precision = DefaultPrecision) {
    return UnsignedFloatBase::toString(Digits, Exponent, Width, Precision);
  }

  /// \brief Print a decimal representation.
  ///
  /// Print a string.  See toString for documentation.
  raw_ostream &print(raw_ostream &OS,
                     unsigned Precision = DefaultPrecision) const {
    return UnsignedFloatBase::print(OS, Digits, Exponent, Width, Precision);
  }
  void dump() const { return UnsignedFloatBase::dump(Digits, Exponent, Width); }

  UnsignedFloat &operator+=(const UnsignedFloat &X);
  UnsignedFloat &operator-=(const UnsignedFloat &X);
  UnsignedFloat &operator*=(const UnsignedFloat &X);
  UnsignedFloat &operator/=(const UnsignedFloat &X);
  UnsignedFloat &operator<<=(int16_t Shift) { shiftLeft(Shift); return *this; }
  UnsignedFloat &operator>>=(int16_t Shift) { shiftRight(Shift); return *this; }

private:
  void shiftLeft(int32_t Shift);
  void shiftRight(int32_t Shift);

  /// \brief Adjust two floats to have matching exponents.
  ///
  /// Adjust \c this and \c X to have matching exponents.  Returns the new \c X
  /// by value.  Does nothing if \a isZero() for either.
  ///
  /// The value that compares smaller will lose precision, and possibly become
  /// \a isZero().
  UnsignedFloat matchExponents(UnsignedFloat X);

  /// \brief Increase exponent to match another float.
  ///
  /// Increases \c this to have an exponent matching \c X.  May decrease the
  /// exponent of \c X in the process, and \c this may possibly become \a
  /// isZero().
  void increaseExponentToMatch(UnsignedFloat &X, int32_t ExponentDiff);

public:
  /// \brief Scale a large number accurately.
  ///
  /// Scale N (multiply it by this).  Uses full precision multiplication, even
  /// if Width is smaller than 64, so information is not lost.
  uint64_t scale(uint64_t N) const;
  uint64_t scaleByInverse(uint64_t N) const {
    // TODO: implement directly, rather than relying on inverse.  Inverse is
    // expensive.
    return inverse().scale(N);
  }
  int64_t scale(int64_t N) const {
    std::pair<uint64_t, bool> Unsigned = splitSigned(N);
    return joinSigned(scale(Unsigned.first), Unsigned.second);
  }
  int64_t scaleByInverse(int64_t N) const {
    std::pair<uint64_t, bool> Unsigned = splitSigned(N);
    return joinSigned(scaleByInverse(Unsigned.first), Unsigned.second);
  }

  int compare(const UnsignedFloat &X) const;
  int compareTo(uint64_t N) const {
    UnsignedFloat Float = getFloat(N);
    int Compare = compare(Float);
    if (Width == 64 || Compare != 0)
      return Compare;

    // Check for precision loss.  We know *this == RoundTrip.
    uint64_t RoundTrip = Float.template toInt<uint64_t>();
    return N == RoundTrip ? 0 : RoundTrip < N ? -1 : 1;
  }
  // An unsigned value always compares greater than a negative N.
  int compareTo(int64_t N) const { return N < 0 ? 1 : compareTo(uint64_t(N)); }

  UnsignedFloat &invert() { return *this = UnsignedFloat::getFloat(1) / *this; }
  UnsignedFloat inverse() const { return UnsignedFloat(*this).invert(); }

private:
  static UnsignedFloat getProduct(DigitsType L, DigitsType R);
  static UnsignedFloat getQuotient(DigitsType Dividend, DigitsType Divisor);

  std::pair<int32_t, int> lgImpl() const;
  // countLeadingZeros for DigitsType, computed via the 32/64-bit helpers.
  static int countLeadingZerosWidth(DigitsType Digits) {
    if (Width == 64)
      return countLeadingZeros64(Digits);
    if (Width == 32)
      return countLeadingZeros32(Digits);
    return countLeadingZeros32(Digits) + Width - 32;
  }

  /// Build a float from a 64-bit digit value, shifting (and rounding to
  /// nearest) if it doesn't fit in DigitsType.
  static UnsignedFloat adjustToWidth(uint64_t N, int32_t S) {
    assert(S >= MinExponent);
    assert(S <= MaxExponent);
    if (Width == 64 || N <= DigitsLimits::max())
      return UnsignedFloat(N, S);

    // Shift right.
    int Shift = 64 - Width - countLeadingZeros64(N);
    DigitsType Shifted = N >> Shift;

    // Round.
    assert(S + Shift <= MaxExponent);
    return getRounded(UnsignedFloat(Shifted, S + Shift),
                      N & UINT64_C(1) << (Shift - 1));
  }

  /// Round \c P up by one digit if \c Round, handling digit overflow by
  /// bumping the exponent instead.
  static UnsignedFloat getRounded(UnsignedFloat P, bool Round) {
    if (!Round)
      return P;
    if (P.Digits == DigitsLimits::max())
      // Careful of overflow in the exponent.
      return UnsignedFloat(1, P.Exponent) <<= Width;
    return UnsignedFloat(P.Digits + 1, P.Exponent);
  }
};

#define UNSIGNED_FLOAT_BOP(op, base)                                           \
  template <class DigitsT>                                                     \
  UnsignedFloat<DigitsT> operator op(const UnsignedFloat<DigitsT> &L,          \
                                     const UnsignedFloat<DigitsT> &R) {        \
    return UnsignedFloat<DigitsT>(L) base R;                                   \
  }
UNSIGNED_FLOAT_BOP(+, += )
UNSIGNED_FLOAT_BOP(-, -= )
UNSIGNED_FLOAT_BOP(*, *= )
UNSIGNED_FLOAT_BOP(/, /= )
UNSIGNED_FLOAT_BOP(<<, <<= )
UNSIGNED_FLOAT_BOP(>>, >>= )
#undef UNSIGNED_FLOAT_BOP

template <class DigitsT>
raw_ostream &operator<<(raw_ostream &OS, const UnsignedFloat<DigitsT> &X) {
  return X.print(OS, 10);
}

#define UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, T1, T2)                             \
  template <class DigitsT>                                                     \
  bool operator op(const UnsignedFloat<DigitsT> &L, T1 R) {                    \
    return L.compareTo(T2(R)) op 0;                                            \
  }                                                                            \
  template <class DigitsT>                                                     \
  bool operator op(T1 L, const UnsignedFloat<DigitsT> &R) {                    \
    return 0 op R.compareTo(T2(L));                                            \
  }
#define UNSIGNED_FLOAT_COMPARE_TO(op)                                          \
  UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, uint64_t, uint64_t)                       \
  UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, uint32_t, uint64_t)                       \
  UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, int64_t, int64_t)                         \
  UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, int32_t, int64_t)
UNSIGNED_FLOAT_COMPARE_TO(< )
UNSIGNED_FLOAT_COMPARE_TO(> )
UNSIGNED_FLOAT_COMPARE_TO(== )
UNSIGNED_FLOAT_COMPARE_TO(!= )
UNSIGNED_FLOAT_COMPARE_TO(<= )
UNSIGNED_FLOAT_COMPARE_TO(>= )
#undef UNSIGNED_FLOAT_COMPARE_TO
#undef UNSIGNED_FLOAT_COMPARE_TO_TYPE

template <class DigitsT>
uint64_t UnsignedFloat<DigitsT>::scale(uint64_t N) const {
  if (Width == 64 || N <= DigitsLimits::max())
    return (getFloat(N) * *this).template toInt<uint64_t>();

  // Defer to the 64-bit version.
  return UnsignedFloat<uint64_t>(Digits, Exponent).scale(N);
}

template <class DigitsT>
UnsignedFloat<DigitsT> UnsignedFloat<DigitsT>::getProduct(DigitsType L,
                                                          DigitsType R) {
  // Check for zero.
  if (!L || !R)
    return getZero();

  // Check for numbers that we can compute with 64-bit math.
  if (Width <= 32 || (L <= UINT32_MAX && R <= UINT32_MAX))
    return adjustToWidth(uint64_t(L) * uint64_t(R), 0);

  // Do the full thing.
  return UnsignedFloat(multiply64(L, R));
}
template <class DigitsT>
UnsignedFloat<DigitsT> UnsignedFloat<DigitsT>::getQuotient(DigitsType Dividend,
                                                           DigitsType Divisor) {
  // Check for zero.
  if (!Dividend)
    return getZero();
  if (!Divisor)
    return getLargest();

  if (Width == 64)
    return UnsignedFloat(divide64(Dividend, Divisor));

  // We can compute this with 64-bit math.
  int Shift = countLeadingZeros64(Dividend);
  uint64_t Shifted = uint64_t(Dividend) << Shift;
  uint64_t Quotient = Shifted / Divisor;

  // If Quotient needs to be shifted, then adjustToWidth will round.
  if (Quotient > DigitsLimits::max())
    return adjustToWidth(Quotient, -Shift);

  // Round based on the value of the next bit.
  return getRounded(UnsignedFloat(Quotient, -Shift),
                    Shifted % Divisor >= getHalf(Divisor));
}

template <class DigitsT>
template <class IntT>
IntT UnsignedFloat<DigitsT>::toInt() const {
  typedef std::numeric_limits<IntT> Limits;
  // Saturate: values below 1 truncate to 0, values at/above IntT's max clamp.
  if (*this < 1)
    return 0;
  if (*this >= Limits::max())
    return Limits::max();

  IntT N = Digits;
  if (Exponent > 0) {
    assert(size_t(Exponent) < sizeof(IntT) * 8);
    return N << Exponent;
  }
  if (Exponent < 0) {
    assert(size_t(-Exponent) < sizeof(IntT) * 8);
    return N >> -Exponent;
  }
  return N;
}

template <class DigitsT>
std::pair<int32_t, int> UnsignedFloat<DigitsT>::lgImpl() const {
  if (isZero())
    return std::make_pair(INT32_MIN, 0);

  // Get the floor of the lg of Digits.
  int32_t LocalFloor = Width - countLeadingZerosWidth(Digits) - 1;

  // Get the floor of the lg of this.
  int32_t Floor = Exponent + LocalFloor;
  if (Digits == UINT64_C(1) << LocalFloor)
    return std::make_pair(Floor, 0);

  // Round based on the next digit.
  assert(LocalFloor >= 1);
  bool Round = Digits & UINT64_C(1) << (LocalFloor - 1);
  return std::make_pair(Floor + Round, Round ? 1 : -1);
}

template <class DigitsT>
UnsignedFloat<DigitsT> UnsignedFloat<DigitsT>::matchExponents(UnsignedFloat X) {
  if (isZero() || X.isZero() || Exponent == X.Exponent)
    return X;

  int32_t Diff = int32_t(X.Exponent) - int32_t(Exponent);
  if (Diff > 0)
    increaseExponentToMatch(X, Diff);
  else
    X.increaseExponentToMatch(*this, -Diff);
  return X;
}
template <class DigitsT>
void UnsignedFloat<DigitsT>::increaseExponentToMatch(UnsignedFloat &X,
                                                     int32_t ExponentDiff) {
  assert(ExponentDiff > 0);
  if (ExponentDiff >= 2 * Width) {
    *this = getZero();
    return;
  }

  // Use up any leading zeros on X, and then shift this.
  int32_t ShiftX = std::min(countLeadingZerosWidth(X.Digits), ExponentDiff);
  assert(ShiftX < Width);

  int32_t ShiftThis = ExponentDiff - ShiftX;
  if (ShiftThis >= Width) {
    *this = getZero();
    return;
  }

  X.Digits <<= ShiftX;
  X.Exponent -= ShiftX;
  Digits >>= ShiftThis;
  Exponent += ShiftThis;
  return;
}

template <class DigitsT>
UnsignedFloat<DigitsT> &UnsignedFloat<DigitsT>::
operator+=(const UnsignedFloat &X) {
  if (isLargest() || X.isZero())
    return *this;
  if (isZero() || X.isLargest())
    return *this = X;

  // Normalize exponents.
  UnsignedFloat Scaled = matchExponents(X);

  // Check for zero again.
  if (isZero())
    return *this = Scaled;
  if (Scaled.isZero())
    return *this;

  // Compute sum.
  DigitsType Sum = Digits + Scaled.Digits;
  bool DidOverflow = Sum < Digits;
  Digits = Sum;
  if (!DidOverflow)
    return *this;

  if (Exponent == MaxExponent)
    return *this = getLargest();

  // The sum carried out of the digits: halve them and reinstall the carry as
  // the new most-significant bit.
  ++Exponent;
  Digits = UINT64_C(1) << (Width - 1) | Digits >> 1;

  return *this;
}
template <class DigitsT>
UnsignedFloat<DigitsT> &UnsignedFloat<DigitsT>::
operator-=(const UnsignedFloat &X) {
  if (X.isZero())
    return *this;
  if (*this <= X)
    return *this = getZero();

  // Normalize exponents.
  UnsignedFloat Scaled = matchExponents(X);
  assert(Digits >= Scaled.Digits);

  // Compute difference.
  if (!Scaled.isZero()) {
    Digits -= Scaled.Digits;
    return *this;
  }

  // Check if X just barely lost its last bit.  E.g., for 32-bit:
  //
  //   1*2^32 - 1*2^0 == 0xffffffff != 1*2^32
  if (*this == UnsignedFloat(1, X.lgFloor() + Width)) {
    Digits = DigitsType(0) - 1;
    --Exponent;
  }
  return *this;
}
template <class DigitsT>
UnsignedFloat<DigitsT> &UnsignedFloat<DigitsT>::
operator*=(const UnsignedFloat &X) {
  if (isZero())
    return *this;
  if (X.isZero())
    return *this = X;

  // Save the exponents.
  int32_t Exponents = int32_t(Exponent) + int32_t(X.Exponent);

  // Get the raw product.
  *this = getProduct(Digits, X.Digits);

  // Combine with exponents.
  return *this <<= Exponents;
}
template <class DigitsT>
UnsignedFloat<DigitsT> &UnsignedFloat<DigitsT>::
operator/=(const UnsignedFloat &X) {
  if (isZero())
    return *this;
  if (X.isZero())
    return *this = getLargest();

  // Save the exponents.
  int32_t Exponents = int32_t(Exponent) - int32_t(X.Exponent);

  // Get the raw quotient.
  *this = getQuotient(Digits, X.Digits);

  // Combine with exponents.
  return *this <<= Exponents;
}
template <class DigitsT>
void UnsignedFloat<DigitsT>::shiftLeft(int32_t Shift) {
  if (!Shift || isZero())
    return;
  assert(Shift != INT32_MIN);
  if (Shift < 0) {
    shiftRight(-Shift);
    return;
  }

  // Shift as much as we can in the exponent.
  int32_t ExponentShift = std::min(Shift, MaxExponent - Exponent);
  Exponent += ExponentShift;
  if (ExponentShift == Shift)
    return;

  // Check this late, since it's rare.
  if (isLargest())
    return;

  // Shift the digits themselves.
  Shift -= ExponentShift;
  if (Shift > countLeadingZerosWidth(Digits)) {
    // Saturate.
    *this = getLargest();
    return;
  }

  Digits <<= Shift;
  return;
}

template <class DigitsT>
void UnsignedFloat<DigitsT>::shiftRight(int32_t Shift) {
  if (!Shift || isZero())
    return;
  assert(Shift != INT32_MIN);
  if (Shift < 0) {
    shiftLeft(-Shift);
    return;
  }

  // Shift as much as we can in the exponent.
  int32_t ExponentShift = std::min(Shift, Exponent - MinExponent);
  Exponent -= ExponentShift;
  if (ExponentShift == Shift)
    return;

  // Shift the digits themselves.
  Shift -= ExponentShift;
  if (Shift >= Width) {
    // Saturate.
    *this = getZero();
    return;
  }

  Digits >>= Shift;
  return;
}

template <class DigitsT>
int UnsignedFloat<DigitsT>::compare(const UnsignedFloat &X) const {
  // Check for zero.
  if (isZero())
    return X.isZero() ? 0 : -1;
  if (X.isZero())
    return 1;

  // Check for the scale.  Use lgFloor to be sure that the exponent difference
  // is always lower than 64.
  int32_t lgL = lgFloor(), lgR = X.lgFloor();
  if (lgL != lgR)
    return lgL < lgR ? -1 : 1;

  // Compare digits.
  if (Exponent < X.Exponent)
    return UnsignedFloatBase::compare(Digits, X.Digits, X.Exponent - Exponent);

  return -UnsignedFloatBase::compare(X.Digits, Digits, Exponent - X.Exponent);
}

template <class T> struct isPodLike<UnsignedFloat<T>> {
  static const bool value = true;
};
}

//===----------------------------------------------------------------------===//
//
// BlockMass definition.
//
// TODO: Make this private to BlockFrequencyInfoImpl or delete.
//
//===----------------------------------------------------------------------===//
namespace llvm {

/// \brief Mass of a block.
///
/// This class implements a sort of fixed-point fraction always between 0.0 and
/// 1.0.  getMass() == UINT64_MAX indicates a value of 1.0.
///
/// Masses can be added and subtracted.  Simple saturation arithmetic is used,
/// so arithmetic operations never overflow or underflow.
///
/// Masses can be multiplied.  Multiplication treats full mass as 1.0 and uses
/// an inexpensive floating-point algorithm that's off-by-one (almost, but not
/// quite, maximum precision).
///
/// Masses can be scaled by \a BranchProbability at maximum precision.
class BlockMass {
  uint64_t Mass;

public:
  BlockMass() : Mass(0) {}
  explicit BlockMass(uint64_t Mass) : Mass(Mass) {}

  static BlockMass getEmpty() { return BlockMass(); }
  static BlockMass getFull() { return BlockMass(UINT64_MAX); }

  uint64_t getMass() const { return Mass; }

  bool isFull() const { return Mass == UINT64_MAX; }
  bool isEmpty() const { return !Mass; }

  bool operator!() const { return isEmpty(); }

  /// \brief Add another mass.
  ///
  /// Adds another mass, saturating at \a isFull() rather than overflowing.
  BlockMass &operator+=(const BlockMass &X) {
    uint64_t Sum = Mass + X.Mass;
    Mass = Sum < Mass ? UINT64_MAX : Sum;
    return *this;
  }

  /// \brief Subtract another mass.
  ///
  /// Subtracts another mass, saturating at \a isEmpty() rather than
  /// underflowing.
  BlockMass &operator-=(const BlockMass &X) {
    uint64_t Diff = Mass - X.Mass;
    Mass = Diff > Mass ? 0 : Diff;
    return *this;
  }

  BlockMass &operator*=(const BranchProbability &P) {
    Mass = P.scale(Mass);
    return *this;
  }

  bool operator==(const BlockMass &X) const { return Mass == X.Mass; }
  bool operator!=(const BlockMass &X) const { return Mass != X.Mass; }
  bool operator<=(const BlockMass &X) const { return Mass <= X.Mass; }
  bool operator>=(const BlockMass &X) const { return Mass >= X.Mass; }
  bool operator<(const BlockMass &X) const { return Mass < X.Mass; }
  bool operator>(const BlockMass &X) const { return Mass > X.Mass; }

  /// \brief Convert to floating point.
  ///
  /// Convert to a float.  \a isFull() gives 1.0, while \a isEmpty() gives
  /// slightly above 0.0.
  UnsignedFloat<uint64_t> toFloat() const;

  void dump() const;
  raw_ostream &print(raw_ostream &OS) const;
};

inline BlockMass operator+(const BlockMass &L, const BlockMass &R) {
  return BlockMass(L) += R;
}
inline BlockMass operator-(const BlockMass &L, const BlockMass &R) {
  return BlockMass(L) -= R;
}
inline BlockMass operator*(const BlockMass &L, const BranchProbability &R) {
  return BlockMass(L) *= R;
}
inline BlockMass operator*(const BranchProbability &L, const BlockMass &R) {
  return BlockMass(R) *= L;
}

inline raw_ostream &operator<<(raw_ostream &OS, const BlockMass &X) {
  return X.print(OS);
}

template <> struct isPodLike<BlockMass> {
  static const bool value = true;
};
}

//===----------------------------------------------------------------------===//
//
// BlockFrequencyInfoImpl definition.
808// 809//===----------------------------------------------------------------------===// 810namespace llvm { 811 812class BasicBlock; 813class BranchProbabilityInfo; 814class Function; 815class Loop; 816class LoopInfo; 817class MachineBasicBlock; 818class MachineBranchProbabilityInfo; 819class MachineFunction; 820class MachineLoop; 821class MachineLoopInfo; 822 823namespace bfi_detail { 824struct IrreducibleGraph; 825 826// This is part of a workaround for a GCC 4.7 crash on lambdas. 827template <class BT> struct BlockEdgesAdder; 828} 829 830/// \brief Base class for BlockFrequencyInfoImpl 831/// 832/// BlockFrequencyInfoImplBase has supporting data structures and some 833/// algorithms for BlockFrequencyInfoImplBase. Only algorithms that depend on 834/// the block type (or that call such algorithms) are skipped here. 835/// 836/// Nevertheless, the majority of the overall algorithm documention lives with 837/// BlockFrequencyInfoImpl. See there for details. 838class BlockFrequencyInfoImplBase { 839public: 840 typedef UnsignedFloat<uint64_t> Float; 841 842 /// \brief Representative of a block. 843 /// 844 /// This is a simple wrapper around an index into the reverse-post-order 845 /// traversal of the blocks. 846 /// 847 /// Unlike a block pointer, its order has meaning (location in the 848 /// topological sort) and it's class is the same regardless of block type. 
849 struct BlockNode { 850 typedef uint32_t IndexType; 851 IndexType Index; 852 853 bool operator==(const BlockNode &X) const { return Index == X.Index; } 854 bool operator!=(const BlockNode &X) const { return Index != X.Index; } 855 bool operator<=(const BlockNode &X) const { return Index <= X.Index; } 856 bool operator>=(const BlockNode &X) const { return Index >= X.Index; } 857 bool operator<(const BlockNode &X) const { return Index < X.Index; } 858 bool operator>(const BlockNode &X) const { return Index > X.Index; } 859 860 BlockNode() : Index(UINT32_MAX) {} 861 BlockNode(IndexType Index) : Index(Index) {} 862 863 bool isValid() const { return Index <= getMaxIndex(); } 864 static size_t getMaxIndex() { return UINT32_MAX - 1; } 865 }; 866 867 /// \brief Stats about a block itself. 868 struct FrequencyData { 869 Float Floating; 870 uint64_t Integer; 871 }; 872 873 /// \brief Data about a loop. 874 /// 875 /// Contains the data necessary to represent represent a loop as a 876 /// pseudo-node once it's packaged. 877 struct LoopData { 878 typedef SmallVector<std::pair<BlockNode, BlockMass>, 4> ExitMap; 879 typedef SmallVector<BlockNode, 4> NodeList; 880 LoopData *Parent; ///< The parent loop. 881 bool IsPackaged; ///< Whether this has been packaged. 882 uint32_t NumHeaders; ///< Number of headers. 883 ExitMap Exits; ///< Successor edges (and weights). 884 NodeList Nodes; ///< Header and the members of the loop. 885 BlockMass BackedgeMass; ///< Mass returned to loop header. 
886 BlockMass Mass; 887 Float Scale; 888 889 LoopData(LoopData *Parent, const BlockNode &Header) 890 : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header) {} 891 template <class It1, class It2> 892 LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther, 893 It2 LastOther) 894 : Parent(Parent), IsPackaged(false), Nodes(FirstHeader, LastHeader) { 895 NumHeaders = Nodes.size(); 896 Nodes.insert(Nodes.end(), FirstOther, LastOther); 897 } 898 bool isHeader(const BlockNode &Node) const { 899 if (isIrreducible()) 900 return std::binary_search(Nodes.begin(), Nodes.begin() + NumHeaders, 901 Node); 902 return Node == Nodes[0]; 903 } 904 BlockNode getHeader() const { return Nodes[0]; } 905 bool isIrreducible() const { return NumHeaders > 1; } 906 907 NodeList::const_iterator members_begin() const { 908 return Nodes.begin() + NumHeaders; 909 } 910 NodeList::const_iterator members_end() const { return Nodes.end(); } 911 iterator_range<NodeList::const_iterator> members() const { 912 return make_range(members_begin(), members_end()); 913 } 914 }; 915 916 /// \brief Index of loop information. 917 struct WorkingData { 918 BlockNode Node; ///< This node. 919 LoopData *Loop; ///< The loop this block is inside. 920 BlockMass Mass; ///< Mass distribution from the entry block. 921 922 WorkingData(const BlockNode &Node) : Node(Node), Loop(nullptr) {} 923 924 bool isLoopHeader() const { return Loop && Loop->isHeader(Node); } 925 bool isDoubleLoopHeader() const { 926 return isLoopHeader() && Loop->Parent && Loop->Parent->isIrreducible() && 927 Loop->Parent->isHeader(Node); 928 } 929 930 LoopData *getContainingLoop() const { 931 if (!isLoopHeader()) 932 return Loop; 933 if (!isDoubleLoopHeader()) 934 return Loop->Parent; 935 return Loop->Parent->Parent; 936 } 937 938 /// \brief Resolve a node to its representative. 939 /// 940 /// Get the node currently representing Node, which could be a containing 941 /// loop. 
942 /// 943 /// This function should only be called when distributing mass. As long as 944 /// there are no irreducilbe edges to Node, then it will have complexity 945 /// O(1) in this context. 946 /// 947 /// In general, the complexity is O(L), where L is the number of loop 948 /// headers Node has been packaged into. Since this method is called in 949 /// the context of distributing mass, L will be the number of loop headers 950 /// an early exit edge jumps out of. 951 BlockNode getResolvedNode() const { 952 auto L = getPackagedLoop(); 953 return L ? L->getHeader() : Node; 954 } 955 LoopData *getPackagedLoop() const { 956 if (!Loop || !Loop->IsPackaged) 957 return nullptr; 958 auto L = Loop; 959 while (L->Parent && L->Parent->IsPackaged) 960 L = L->Parent; 961 return L; 962 } 963 964 /// \brief Get the appropriate mass for a node. 965 /// 966 /// Get appropriate mass for Node. If Node is a loop-header (whose loop 967 /// has been packaged), returns the mass of its pseudo-node. If it's a 968 /// node inside a packaged loop, it returns the loop's mass. 969 BlockMass &getMass() { 970 if (!isAPackage()) 971 return Mass; 972 if (!isADoublePackage()) 973 return Loop->Mass; 974 return Loop->Parent->Mass; 975 } 976 977 /// \brief Has ContainingLoop been packaged up? 978 bool isPackaged() const { return getResolvedNode() != Node; } 979 /// \brief Has Loop been packaged up? 980 bool isAPackage() const { return isLoopHeader() && Loop->IsPackaged; } 981 /// \brief Has Loop been packaged up twice? 982 bool isADoublePackage() const { 983 return isDoubleLoopHeader() && Loop->Parent->IsPackaged; 984 } 985 }; 986 987 /// \brief Unscaled probability weight. 988 /// 989 /// Probability weight for an edge in the graph (including the 990 /// successor/target node). 991 /// 992 /// All edges in the original function are 32-bit. However, exit edges from 993 /// loop packages are taken from 64-bit exit masses, so we need 64-bits of 994 /// space in general. 
///
/// In addition to the raw weight amount, Weight stores the type of the edge
/// in the current context (i.e., the context of the loop being processed).
/// Is this a local edge within the loop, an exit from the loop, or a
/// backedge to the loop header?
  struct Weight {
    enum DistType { Local, Exit, Backedge };
    DistType Type;
    BlockNode TargetNode;
    uint64_t Amount;
    // Default is a zero-weight local edge; TargetNode is default-constructed.
    Weight() : Type(Local), Amount(0) {}
  };

  /// \brief Distribution of unscaled probability weight.
  ///
  /// Distribution of unscaled probability weight to a set of successors.
  ///
  /// This class collates the successor edge weights for later processing.
  ///
  /// \a DidOverflow indicates whether \a Total did overflow while adding to
  /// the distribution.  It should never overflow twice.
  struct Distribution {
    typedef SmallVector<Weight, 4> WeightList;
    WeightList Weights;    ///< Individual successor weights.
    uint64_t Total;        ///< Sum of all weights.
    bool DidOverflow;      ///< Whether \a Total did overflow.

    Distribution() : Total(0), DidOverflow(false) {}
    void addLocal(const BlockNode &Node, uint64_t Amount) {
      add(Node, Amount, Weight::Local);
    }
    void addExit(const BlockNode &Node, uint64_t Amount) {
      add(Node, Amount, Weight::Exit);
    }
    void addBackedge(const BlockNode &Node, uint64_t Amount) {
      add(Node, Amount, Weight::Backedge);
    }

    /// \brief Normalize the distribution.
    ///
    /// Combines multiple edges to the same \a Weight::TargetNode and scales
    /// down so that \a Total fits into 32-bits.
    ///
    /// This is linear in the size of \a Weights.  For the vast majority of
    /// cases, adjacent edge weights are combined by sorting WeightList and
    /// combining adjacent weights.  However, for very large edge lists an
    /// auxiliary hash table is used.
    void normalize();

  private:
    void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type);
  };

  /// \brief Data about each block.  This is used downstream.
  std::vector<FrequencyData> Freqs;

  /// \brief Loop data: see initializeLoops().
  std::vector<WorkingData> Working;

  /// \brief Indexed information about loops.
  std::list<LoopData> Loops;

  /// \brief Add all edges out of a packaged loop to the distribution.
  ///
  /// Adds all edges from LocalLoopHead to Dist.  Calls addToDist() to add each
  /// successor edge.
  ///
  /// \return \c true unless there's an irreducible backedge.
  bool addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop,
                               Distribution &Dist);

  /// \brief Add an edge to the distribution.
  ///
  /// Adds an edge to Succ to Dist.  If \c LoopHead.isValid(), then whether the
  /// edge is local/exit/backedge is in the context of LoopHead.  Otherwise,
  /// every edge should be a local edge (since all the loops are packaged up).
  ///
  /// \return \c true unless aborted due to an irreducible backedge.
  bool addToDist(Distribution &Dist, const LoopData *OuterLoop,
                 const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight);

  /// \pre \c Head is a loop header (both asserts below enforce this).
  LoopData &getLoopPackage(const BlockNode &Head) {
    assert(Head.Index < Working.size());
    assert(Working[Head.Index].isLoopHeader());
    return *Working[Head.Index].Loop;
  }

  /// \brief Analyze irreducible SCCs.
  ///
  /// Separate irreducible SCCs from \c G, which is an explict graph of \c
  /// OuterLoop (or the top-level function, if \c OuterLoop is \c nullptr).
  /// Insert them into \a Loops before \c Insert.
  ///
  /// \return the \c LoopData nodes representing the irreducible SCCs.
  iterator_range<std::list<LoopData>::iterator>
  analyzeIrreducible(const bfi_detail::IrreducibleGraph &G, LoopData *OuterLoop,
                     std::list<LoopData>::iterator Insert);

  /// \brief Update a loop after packaging irreducible SCCs inside of it.
  ///
  /// Update \c OuterLoop.  Before finding irreducible control flow, it was
  /// partway through \a computeMassInLoop(), so \a LoopData::Exits and \a
  /// LoopData::BackedgeMass need to be reset.  Also, nodes that were packaged
  /// up need to be removed from \a OuterLoop::Nodes.
  void updateLoopWithIrreducible(LoopData &OuterLoop);

  /// \brief Distribute mass according to a distribution.
  ///
  /// Distributes the mass in Source according to Dist.  If LoopHead.isValid(),
  /// backedges and exits are stored in its entry in Loops.
  ///
  /// Mass is distributed in parallel from two copies of the source mass.
  void distributeMass(const BlockNode &Source, LoopData *OuterLoop,
                      Distribution &Dist);

  /// \brief Compute the loop scale for a loop.
  void computeLoopScale(LoopData &Loop);

  /// \brief Package up a loop.
  void packageLoop(LoopData &Loop);

  /// \brief Unwrap loops.
  void unwrapLoops();

  /// \brief Finalize frequency metrics.
  ///
  /// Calculates final frequencies and cleans up no-longer-needed data
  /// structures.
  void finalizeMetrics();

  /// \brief Clear all memory.
  void clear();

  virtual std::string getBlockName(const BlockNode &Node) const;
  std::string getLoopName(const LoopData &Loop) const;

  // Base-class print is a no-op; derived classes override it.
  virtual raw_ostream &print(raw_ostream &OS) const { return OS; }
  void dump() const { print(dbgs()); }

  Float getFloatingBlockFreq(const BlockNode &Node) const;

  BlockFrequency getBlockFreq(const BlockNode &Node) const;

  raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const;
  raw_ostream &printBlockFreq(raw_ostream &OS,
                              const BlockFrequency &Freq) const;

  /// \brief Integer frequency of the entry block (index 0 in RPO).
  uint64_t getEntryFreq() const {
    assert(!Freqs.empty());
    return Freqs[0].Integer;
  }
  /// \brief Virtual destructor.
  ///
  /// Need a virtual destructor to mask the compiler warning about
  /// getBlockName().
  virtual ~BlockFrequencyInfoImplBase() {}
};

namespace bfi_detail {
/// \brief Map from a block type to the rest of its IR/MIR type family.
template <class BlockT> struct TypeMap {};
template <> struct TypeMap<BasicBlock> {
  typedef BasicBlock BlockT;
  typedef Function FunctionT;
  typedef BranchProbabilityInfo BranchProbabilityInfoT;
  typedef Loop LoopT;
  typedef LoopInfo LoopInfoT;
};
template <> struct TypeMap<MachineBasicBlock> {
  typedef MachineBasicBlock BlockT;
  typedef MachineFunction FunctionT;
  typedef MachineBranchProbabilityInfo BranchProbabilityInfoT;
  typedef MachineLoop LoopT;
  typedef MachineLoopInfo LoopInfoT;
};

/// \brief Get the name of a MachineBasicBlock.
///
/// Get the name of a MachineBasicBlock.  It's templated so that including from
/// CodeGen is unnecessary (that would be a layering issue).
///
/// This is used mainly for debug output.  The name is similar to
/// MachineBasicBlock::getFullName(), but skips the name of the function.
1177template <class BlockT> std::string getBlockName(const BlockT *BB) { 1178 assert(BB && "Unexpected nullptr"); 1179 auto MachineName = "BB" + Twine(BB->getNumber()); 1180 if (BB->getBasicBlock()) 1181 return (MachineName + "[" + BB->getName() + "]").str(); 1182 return MachineName.str(); 1183} 1184/// \brief Get the name of a BasicBlock. 1185template <> inline std::string getBlockName(const BasicBlock *BB) { 1186 assert(BB && "Unexpected nullptr"); 1187 return BB->getName().str(); 1188} 1189 1190/// \brief Graph of irreducible control flow. 1191/// 1192/// This graph is used for determining the SCCs in a loop (or top-level 1193/// function) that has irreducible control flow. 1194/// 1195/// During the block frequency algorithm, the local graphs are defined in a 1196/// light-weight way, deferring to the \a BasicBlock or \a MachineBasicBlock 1197/// graphs for most edges, but getting others from \a LoopData::ExitMap. The 1198/// latter only has successor information. 1199/// 1200/// \a IrreducibleGraph makes this graph explicit. It's in a form that can use 1201/// \a GraphTraits (so that \a analyzeIrreducible() can use \a scc_iterator), 1202/// and it explicitly lists predecessors and successors. The initialization 1203/// that relies on \c MachineBasicBlock is defined in the header. 
struct IrreducibleGraph {
  typedef BlockFrequencyInfoImplBase BFIBase;

  BFIBase &BFI;

  typedef BFIBase::BlockNode BlockNode;
  struct IrrNode {
    BlockNode Node;
    unsigned NumIn;
    // Predecessors occupy the first NumIn entries of Edges; successors
    // follow (see pred_begin()/succ_begin() below).
    std::deque<const IrrNode *> Edges;
    IrrNode(const BlockNode &Node) : Node(Node), NumIn(0) {}

    typedef std::deque<const IrrNode *>::const_iterator iterator;
    iterator pred_begin() const { return Edges.begin(); }
    iterator succ_begin() const { return Edges.begin() + NumIn; }
    iterator pred_end() const { return succ_begin(); }
    iterator succ_end() const { return Edges.end(); }
  };
  BlockNode Start;
  const IrrNode *StartIrr;
  std::vector<IrrNode> Nodes;
  SmallDenseMap<uint32_t, IrrNode *, 4> Lookup;

  /// \brief Construct an explicit graph containing irreducible control flow.
  ///
  /// Construct an explicit graph of the control flow in \c OuterLoop (or the
  /// top-level function, if \c OuterLoop is \c nullptr).  Uses \c
  /// addBlockEdges to add block successors that have not been packaged into
  /// loops.
  ///
  /// \a BlockFrequencyInfoImpl::computeIrreducibleMass() is the only expected
  /// user of this.
  template <class BlockEdgesAdder>
  IrreducibleGraph(BFIBase &BFI, const BFIBase::LoopData *OuterLoop,
                   BlockEdgesAdder addBlockEdges)
      : BFI(BFI), StartIrr(nullptr) {
    initialize(OuterLoop, addBlockEdges);
  }

  template <class BlockEdgesAdder>
  void initialize(const BFIBase::LoopData *OuterLoop,
                  BlockEdgesAdder addBlockEdges);
  void addNodesInLoop(const BFIBase::LoopData &OuterLoop);
  void addNodesInFunction();
  void addNode(const BlockNode &Node) {
    Nodes.emplace_back(Node);
    // Reset the node's working mass; it will be recomputed for the SCC.
    BFI.Working[Node.Index].getMass() = BlockMass::getEmpty();
  }
  void indexNodes();
  template <class BlockEdgesAdder>
  void addEdges(const BlockNode &Node, const BFIBase::LoopData *OuterLoop,
                BlockEdgesAdder addBlockEdges);
  void addEdge(IrrNode &Irr, const BlockNode &Succ,
               const BFIBase::LoopData *OuterLoop);
};
template <class BlockEdgesAdder>
void IrreducibleGraph::initialize(const BFIBase::LoopData *OuterLoop,
                                  BlockEdgesAdder addBlockEdges) {
  if (OuterLoop) {
    addNodesInLoop(*OuterLoop);
    for (auto N : OuterLoop->Nodes)
      addEdges(N, OuterLoop, addBlockEdges);
  } else {
    addNodesInFunction();
    for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index)
      addEdges(Index, OuterLoop, addBlockEdges);
  }
  StartIrr = Lookup[Start.Index];
}
template <class BlockEdgesAdder>
void IrreducibleGraph::addEdges(const BlockNode &Node,
                                const BFIBase::LoopData *OuterLoop,
                                BlockEdgesAdder addBlockEdges) {
  auto L = Lookup.find(Node.Index);
  if (L == Lookup.end())
    return;
  IrrNode &Irr = *L->second;
  const auto &Working = BFI.Working[Node.Index];

  // For a packaged loop, use its recorded exit edges; otherwise defer to the
  // caller-supplied functor for the block's successors.
  if (Working.isAPackage())
    for (const auto &I : Working.Loop->Exits)
      addEdge(Irr, I.first, OuterLoop);
  else
    addBlockEdges(*this, Irr, OuterLoop);
}
} // end namespace bfi_detail

/// \brief Shared implementation for block frequency analysis.
///
/// This is a shared implementation of BlockFrequencyInfo and
/// MachineBlockFrequencyInfo, and calculates the relative frequencies of
/// blocks.
///
/// LoopInfo defines a loop as a "non-trivial" SCC dominated by a single block,
/// which is called the header.  A given loop, L, can have sub-loops, which are
/// loops within the subgraph of L that exclude its header.  (A "trivial" SCC
/// consists of a single block that does not have a self-edge.)
///
/// In addition to loops, this algorithm has limited support for irreducible
/// SCCs, which are SCCs with multiple entry blocks.  Irreducible SCCs are
/// discovered on the fly, and modelled as loops with multiple headers.
///
/// The headers of irreducible sub-SCCs consist of its entry blocks and all
/// nodes that are targets of a backedge within it (excluding backedges within
/// true sub-loops).  Block frequency calculations act as if a block is
/// inserted that intercepts all the edges to the headers.  All backedges and
/// entries point to this block.  Its successors are the headers, which split
/// the frequency evenly.
///
/// This algorithm leverages BlockMass and UnsignedFloat to maintain precision,
/// separates mass distribution from loop scaling, and dithers to eliminate
/// probability mass loss.
///
/// The implementation is split between BlockFrequencyInfoImpl, which knows the
/// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and
/// BlockFrequencyInfoImplBase, which doesn't.  The base class uses \a
/// BlockNode, a wrapper around a uint32_t.  BlockNode is numbered from 0 in
/// reverse-post order.  This gives two advantages: it's easy to compare the
/// relative ordering of two nodes, and maps keyed on BlockT can be represented
/// by vectors.
1324/// 1325/// This algorithm is O(V+E), unless there is irreducible control flow, in 1326/// which case it's O(V*E) in the worst case. 1327/// 1328/// These are the main stages: 1329/// 1330/// 0. Reverse post-order traversal (\a initializeRPOT()). 1331/// 1332/// Run a single post-order traversal and save it (in reverse) in RPOT. 1333/// All other stages make use of this ordering. Save a lookup from BlockT 1334/// to BlockNode (the index into RPOT) in Nodes. 1335/// 1336/// 1. Loop initialization (\a initializeLoops()). 1337/// 1338/// Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of 1339/// the algorithm. In particular, store the immediate members of each loop 1340/// in reverse post-order. 1341/// 1342/// 2. Calculate mass and scale in loops (\a computeMassInLoops()). 1343/// 1344/// For each loop (bottom-up), distribute mass through the DAG resulting 1345/// from ignoring backedges and treating sub-loops as a single pseudo-node. 1346/// Track the backedge mass distributed to the loop header, and use it to 1347/// calculate the loop scale (number of loop iterations). Immediate 1348/// members that represent sub-loops will already have been visited and 1349/// packaged into a pseudo-node. 1350/// 1351/// Distributing mass in a loop is a reverse-post-order traversal through 1352/// the loop. Start by assigning full mass to the Loop header. For each 1353/// node in the loop: 1354/// 1355/// - Fetch and categorize the weight distribution for its successors. 1356/// If this is a packaged-subloop, the weight distribution is stored 1357/// in \a LoopData::Exits. Otherwise, fetch it from 1358/// BranchProbabilityInfo. 1359/// 1360/// - Each successor is categorized as \a Weight::Local, a local edge 1361/// within the current loop, \a Weight::Backedge, a backedge to the 1362/// loop header, or \a Weight::Exit, any successor outside the loop. 1363/// The weight, the successor, and its category are stored in \a 1364/// Distribution. 
There can be multiple edges to each successor. 1365/// 1366/// - If there's a backedge to a non-header, there's an irreducible SCC. 1367/// The usual flow is temporarily aborted. \a 1368/// computeIrreducibleMass() finds the irreducible SCCs within the 1369/// loop, packages them up, and restarts the flow. 1370/// 1371/// - Normalize the distribution: scale weights down so that their sum 1372/// is 32-bits, and coalesce multiple edges to the same node. 1373/// 1374/// - Distribute the mass accordingly, dithering to minimize mass loss, 1375/// as described in \a distributeMass(). 1376/// 1377/// Finally, calculate the loop scale from the accumulated backedge mass. 1378/// 1379/// 3. Distribute mass in the function (\a computeMassInFunction()). 1380/// 1381/// Finally, distribute mass through the DAG resulting from packaging all 1382/// loops in the function. This uses the same algorithm as distributing 1383/// mass in a loop, except that there are no exit or backedge edges. 1384/// 1385/// 4. Unpackage loops (\a unwrapLoops()). 1386/// 1387/// Initialize each block's frequency to a floating point representation of 1388/// its mass. 1389/// 1390/// Visit loops top-down, scaling the frequencies of its immediate members 1391/// by the loop's pseudo-node's frequency. 1392/// 1393/// 5. Convert frequencies to a 64-bit range (\a finalizeMetrics()). 1394/// 1395/// Using the min and max frequencies as a guide, translate floating point 1396/// frequencies to an appropriate range in uint64_t. 1397/// 1398/// It has some known flaws. 1399/// 1400/// - Loop scale is limited to 4096 per loop (2^12) to avoid exhausting 1401/// BlockFrequency's 64-bit integer precision. 1402/// 1403/// - The model of irreducible control flow is a rough approximation. 1404/// 1405/// Modelling irreducible control flow exactly involves setting up and 1406/// solving a group of infinite geometric series. 
Such precision is 1407/// unlikely to be worthwhile, since most of our algorithms give up on 1408/// irreducible control flow anyway. 1409/// 1410/// Nevertheless, we might find that we need to get closer. Here's a sort 1411/// of TODO list for the model with diminishing returns, to be completed as 1412/// necessary. 1413/// 1414/// - The headers for the \a LoopData representing an irreducible SCC 1415/// include non-entry blocks. When these extra blocks exist, they 1416/// indicate a self-contained irreducible sub-SCC. We could treat them 1417/// as sub-loops, rather than arbitrarily shoving the problematic 1418/// blocks into the headers of the main irreducible SCC. 1419/// 1420/// - Backedge frequencies are assumed to be evenly split between the 1421/// headers of a given irreducible SCC. Instead, we could track the 1422/// backedge mass separately for each header, and adjust their relative 1423/// frequencies. 1424/// 1425/// - Entry frequencies are assumed to be evenly split between the 1426/// headers of a given irreducible SCC, which is the only option if we 1427/// need to compute mass in the SCC before its parent loop. Instead, 1428/// we could partially compute mass in the parent loop, and stop when 1429/// we get to the SCC. Here, we have the correct ratio of entry 1430/// masses, which we can use to adjust their relative frequencies. 1431/// Compute mass in the SCC, and then continue propagation in the 1432/// parent. 1433/// 1434/// - We can propagate mass iteratively through the SCC, for some fixed 1435/// number of iterations. Each iteration starts by assigning the entry 1436/// blocks their backedge mass from the prior iteration. The final 1437/// mass for each block (and each exit, and the total backedge mass 1438/// used for computing loop scale) is the sum of all iterations. 1439/// (Running this until fixed point would "solve" the geometric 1440/// series by simulation.) 
template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
  typedef typename bfi_detail::TypeMap<BT>::BlockT BlockT;
  typedef typename bfi_detail::TypeMap<BT>::FunctionT FunctionT;
  typedef typename bfi_detail::TypeMap<BT>::BranchProbabilityInfoT
  BranchProbabilityInfoT;
  typedef typename bfi_detail::TypeMap<BT>::LoopT LoopT;
  typedef typename bfi_detail::TypeMap<BT>::LoopInfoT LoopInfoT;

  // This is part of a workaround for a GCC 4.7 crash on lambdas.
  friend struct bfi_detail::BlockEdgesAdder<BT>;

  typedef GraphTraits<const BlockT *> Successor;
  typedef GraphTraits<Inverse<const BlockT *>> Predecessor;

  const BranchProbabilityInfoT *BPI;
  const LoopInfoT *LI;
  const FunctionT *F;

  // All blocks in reverse postorder.
  std::vector<const BlockT *> RPOT;
  // Map from block pointer to its RPOT index (as a BlockNode); filled in by
  // initializeRPOT().
  DenseMap<const BlockT *, BlockNode> Nodes;

  typedef typename std::vector<const BlockT *>::const_iterator rpot_iterator;

  rpot_iterator rpot_begin() const { return RPOT.begin(); }
  rpot_iterator rpot_end() const { return RPOT.end(); }

  size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); }

  BlockNode getNode(const rpot_iterator &I) const {
    return BlockNode(getIndex(I));
  }
  BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB); }

  const BlockT *getBlock(const BlockNode &Node) const {
    assert(Node.Index < RPOT.size());
    return RPOT[Node.Index];
  }

  /// \brief Run (and save) a post-order traversal.
  ///
  /// Saves a reverse post-order traversal of all the nodes in \a F.
  void initializeRPOT();

  /// \brief Initialize loop data.
  ///
  /// Build up \a Loops using \a LoopInfo.  \a LoopInfo gives us a mapping from
  /// each block to the deepest loop it's in, but we need the inverse.  For each
  /// loop, we store in reverse post-order its "immediate" members, defined as
  /// the header, the headers of immediate sub-loops, and all other blocks in
  /// the loop that are not in sub-loops.
  void initializeLoops();

  /// \brief Propagate to a block's successors.
  ///
  /// In the context of distributing mass through \c OuterLoop, divide the mass
  /// currently assigned to \c Node between its successors.
  ///
  /// \return \c true unless there's an irreducible backedge.
  bool propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node);

  /// \brief Compute mass in a particular loop.
  ///
  /// Assign mass to \c Loop's header, and then for each block in \c Loop in
  /// reverse post-order, distribute mass to its successors.  Only visits nodes
  /// that have not been packaged into sub-loops.
  ///
  /// \pre \a computeMassInLoop() has been called for each subloop of \c Loop.
  /// \return \c true unless there's an irreducible backedge.
  bool computeMassInLoop(LoopData &Loop);

  /// \brief Try to compute mass in the top-level function.
  ///
  /// Assign mass to the entry block, and then for each block in reverse
  /// post-order, distribute mass to its successors.  Skips nodes that have
  /// been packaged into loops.
  ///
  /// \pre \a computeMassInLoops() has been called.
  /// \return \c true unless there's an irreducible backedge.
  bool tryToComputeMassInFunction();

  /// \brief Compute mass in (and package up) irreducible SCCs.
  ///
  /// Find the irreducible SCCs in \c OuterLoop, add them to \a Loops (in front
  /// of \c Insert), and call \a computeMassInLoop() on each of them.
  ///
  /// If \c OuterLoop is \c nullptr, it refers to the top-level function.
  ///
  /// \pre \a computeMassInLoop() has been called for each subloop of \c
  /// OuterLoop.
  /// \pre \c Insert points at the last loop successfully processed by \a
  /// computeMassInLoop().
  /// \pre \c OuterLoop has irreducible SCCs.
  void computeIrreducibleMass(LoopData *OuterLoop,
                              std::list<LoopData>::iterator Insert);

  /// \brief Compute mass in all loops.
  ///
  /// For each loop bottom-up, call \a computeMassInLoop().
  ///
  /// \a computeMassInLoop() aborts (and returns \c false) on loops that
  /// contain irreducible sub-SCCs.  Use \a computeIrreducibleMass() and then
  /// re-enter \a computeMassInLoop().
  ///
  /// \post \a computeMassInLoop() has returned \c true for every loop.
  void computeMassInLoops();

  /// \brief Compute mass in the top-level function.
  ///
  /// Uses \a tryToComputeMassInFunction() and \a computeIrreducibleMass() to
  /// compute mass in the top-level function.
  ///
  /// \post \a tryToComputeMassInFunction() has returned \c true.
  void computeMassInFunction();

  std::string getBlockName(const BlockNode &Node) const override {
    return bfi_detail::getBlockName(getBlock(Node));
  }

public:
  const FunctionT *getFunction() const { return F; }

  void doFunction(const FunctionT *F, const BranchProbabilityInfoT *BPI,
                  const LoopInfoT *LI);
  BlockFrequencyInfoImpl() : BPI(nullptr), LI(nullptr), F(nullptr) {}

  using BlockFrequencyInfoImplBase::getEntryFreq;
  BlockFrequency getBlockFreq(const BlockT *BB) const {
    return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB));
  }
  Float getFloatingBlockFreq(const BlockT *BB) const {
    return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB));
  }

  /// \brief Print the frequencies for the current function.
  ///
  /// Prints the frequencies for the blocks in the current function.
  ///
  /// Blocks are printed in the natural iteration order of the function, rather
  /// than reverse post-order.  This provides two advantages: writing -analyze
  /// tests is easier (since blocks come out in source order), and even
  /// unreachable blocks are printed.
  ///
  /// \a BlockFrequencyInfoImplBase::print() only knows reverse post-order, so
  /// we need to override it here.
  raw_ostream &print(raw_ostream &OS) const override;
  using BlockFrequencyInfoImplBase::dump;

  using BlockFrequencyInfoImplBase::printBlockFreq;
  raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const {
    return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB));
  }
};

template <class BT>
void BlockFrequencyInfoImpl<BT>::doFunction(const FunctionT *F,
                                            const BranchProbabilityInfoT *BPI,
                                            const LoopInfoT *LI) {
  // Save the parameters.
  this->BPI = BPI;
  this->LI = LI;
  this->F = F;

  // Clean up left-over data structures.
  BlockFrequencyInfoImplBase::clear();
  RPOT.clear();
  Nodes.clear();

  // Initialize.
  DEBUG(dbgs() << "\nblock-frequency: " << F->getName() << "\n================="
               << std::string(F->getName().size(), '=') << "\n");
  initializeRPOT();
  initializeLoops();

  // Visit loops in post-order to find the local mass distribution, and then do
  // the full function.
  computeMassInLoops();
  computeMassInFunction();
  unwrapLoops();
  finalizeMetrics();
}

template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() {
  const BlockT *Entry = F->begin();
  RPOT.reserve(F->size());
  std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT));
  std::reverse(RPOT.begin(), RPOT.end());

  // RPOT contains at least the entry block, so the subtraction cannot wrap.
  assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() &&
         "More nodes in function than Block Frequency Info supports");

  DEBUG(dbgs() << "reverse-post-order-traversal\n");
  for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
    BlockNode Node = getNode(I);
    DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node) << "\n");
    Nodes[*I] = Node;
  }

  Working.reserve(RPOT.size());
  for (size_t Index = 0; Index < RPOT.size(); ++Index)
    Working.emplace_back(Index);
  Freqs.resize(RPOT.size());
}

template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() {
  DEBUG(dbgs() << "loop-detection\n");
  if (LI->empty())
    return;

  // Visit loops top down and assign them an index.
  std::deque<std::pair<const LoopT *, LoopData *>> Q;
  for (const LoopT *L : *LI)
    Q.emplace_back(L, nullptr);
  while (!Q.empty()) {
    const LoopT *Loop = Q.front().first;
    LoopData *Parent = Q.front().second;
    Q.pop_front();

    BlockNode Header = getNode(Loop->getHeader());
    assert(Header.isValid());

    Loops.emplace_back(Parent, Header);
    Working[Header.Index].Loop = &Loops.back();
    DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");

    for (const LoopT *L : *Loop)
      Q.emplace_back(L, &Loops.back());
  }

  // Visit nodes in reverse post-order and add them to their deepest containing
  // loop.
  for (size_t Index = 0; Index < RPOT.size(); ++Index) {
    // Loop headers have already been mostly mapped.
    if (Working[Index].isLoopHeader()) {
      LoopData *ContainingLoop = Working[Index].getContainingLoop();
      if (ContainingLoop)
        ContainingLoop->Nodes.push_back(Index);
      continue;
    }

    const LoopT *Loop = LI->getLoopFor(RPOT[Index]);
    if (!Loop)
      continue;

    // Add this node to its containing loop's member list.
    BlockNode Header = getNode(Loop->getHeader());
    assert(Header.isValid());
    const auto &HeaderData = Working[Header.Index];
    assert(HeaderData.isLoopHeader());

    Working[Index].Loop = HeaderData.Loop;
    HeaderData.Loop->Nodes.push_back(Index);
    DEBUG(dbgs() << " - loop = " << getBlockName(Header)
                 << ": member = " << getBlockName(Index) << "\n");
  }
}

template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInLoops() {
  // Visit loops with the deepest first, and the top-level loops last.
  for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) {
    if (computeMassInLoop(*L))
      continue;
    // computeIrreducibleMass() may insert new loops (the irreducible SCCs)
    // into the list before L.base(); save the position via Next so we can
    // re-point L at this loop afterwards and retry it.
    auto Next = std::next(L);
    computeIrreducibleMass(&*L, L.base());
    L = std::prev(Next);
    if (computeMassInLoop(*L))
      continue;
    llvm_unreachable("unhandled irreducible control flow");
  }
}

template <class BT>
bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
  // Compute mass in loop.
  DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n");

  if (Loop.isIrreducible()) {
    // Split the full mass evenly among the headers: header H receives
    // 1/(NumHeaders - H) of what remains, which works out to 1/NumHeaders
    // each, with any rounding error dithered into the later headers.
    BlockMass Remaining = BlockMass::getFull();
    for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
      auto &Mass = Working[Loop.Nodes[H].Index].getMass();
      Mass = Remaining * BranchProbability(1, Loop.NumHeaders - H);
      Remaining -= Mass;
    }
    for (const BlockNode &M : Loop.Nodes)
      if (!propagateMassToSuccessors(&Loop, M))
        llvm_unreachable("unhandled irreducible control flow");
  } else {
    Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
    if (!propagateMassToSuccessors(&Loop, Loop.getHeader()))
      llvm_unreachable("irreducible control flow to loop header!?");
    for (const BlockNode &M : Loop.members())
      if (!propagateMassToSuccessors(&Loop, M))
        // Irreducible backedge.
        return false;
  }

  computeLoopScale(Loop);
  packageLoop(Loop);
  return true;
}

template <class BT>
bool BlockFrequencyInfoImpl<BT>::tryToComputeMassInFunction() {
  // Compute mass in function.
  DEBUG(dbgs() << "compute-mass-in-function\n");
  assert(!Working.empty() && "no blocks in function");
  assert(!Working[0].isLoopHeader() && "entry block is a loop header");

  Working[0].getMass() = BlockMass::getFull();
  for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) {
    // Check for nodes that have been packaged.
    BlockNode Node = getNode(I);
    if (Working[Node.Index].isPackaged())
      continue;

    if (!propagateMassToSuccessors(nullptr, Node))
      return false;
  }
  return true;
}

template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInFunction() {
  if (tryToComputeMassInFunction())
    return;
  // Irreducible control flow at the top level: package it up and retry once.
  computeIrreducibleMass(nullptr, Loops.begin());
  if (tryToComputeMassInFunction())
    return;
  llvm_unreachable("unhandled irreducible control flow");
}

/// \note This should be a lambda, but that crashes GCC 4.7.
namespace bfi_detail {
template <class BT> struct BlockEdgesAdder {
  typedef BT BlockT;
  typedef BlockFrequencyInfoImplBase::LoopData LoopData;
  typedef GraphTraits<const BlockT *> Successor;

  const BlockFrequencyInfoImpl<BT> &BFI;
  explicit BlockEdgesAdder(const BlockFrequencyInfoImpl<BT> &BFI)
      : BFI(BFI) {}
  void operator()(IrreducibleGraph &G, IrreducibleGraph::IrrNode &Irr,
                  const LoopData *OuterLoop) {
    const BlockT *BB = BFI.RPOT[Irr.Node.Index];
    for (auto I = Successor::child_begin(BB), E = Successor::child_end(BB);
         I != E; ++I)
      G.addEdge(Irr, BFI.getNode(*I), OuterLoop);
  }
};
}
template <class BT>
void BlockFrequencyInfoImpl<BT>::computeIrreducibleMass(
    LoopData *OuterLoop, std::list<LoopData>::iterator Insert) {
  DEBUG(dbgs() << "analyze-irreducible-in-";
        if (OuterLoop) dbgs() << "loop: " << getLoopName(*OuterLoop) << "\n";
        else dbgs() << "function\n");

  using namespace bfi_detail;
  // Ideally, addBlockEdges() would be declared here as a lambda, but that
  // crashes GCC 4.7.
  BlockEdgesAdder<BT> addBlockEdges(*this);
  IrreducibleGraph G(*this, OuterLoop, addBlockEdges);

  for (auto &L : analyzeIrreducible(G, OuterLoop, Insert))
    computeMassInLoop(L);

  if (!OuterLoop)
    return;
  updateLoopWithIrreducible(*OuterLoop);
}

template <class BT>
bool
BlockFrequencyInfoImpl<BT>::propagateMassToSuccessors(LoopData *OuterLoop,
                                                      const BlockNode &Node) {
  DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
  // Calculate probability for successors.
  Distribution Dist;
  if (auto *Loop = Working[Node.Index].getPackagedLoop()) {
    assert(Loop != OuterLoop && "Cannot propagate mass in a packaged loop");
    if (!addLoopSuccessorsToDist(OuterLoop, *Loop, Dist))
      // Irreducible backedge.
      return false;
  } else {
    const BlockT *BB = getBlock(Node);
    for (auto SI = Successor::child_begin(BB), SE = Successor::child_end(BB);
         SI != SE; ++SI)
      // Pass the iterator SI (not the successor block) to getEdgeWeight();
      // the block-taking overload is linear in the number of successors.
      if (!addToDist(Dist, OuterLoop, Node, getNode(*SI),
                     BPI->getEdgeWeight(BB, SI)))
        // Irreducible backedge.
        return false;
  }

  // Distribute mass to successors, saving exit and backedge data in the
  // loop header.
  distributeMass(Node, OuterLoop, Dist);
  return true;
}

template <class BT>
raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const {
  if (!F)
    return OS;
  OS << "block-frequency-info: " << F->getName() << "\n";
  for (const BlockT &BB : *F)
    OS << " - " << bfi_detail::getBlockName(&BB)
       << ": float = " << getFloatingBlockFreq(&BB)
       << ", int = " << getBlockFreq(&BB).getFrequency() << "\n";

  // Add an extra newline for readability.
  OS << "\n";
  return OS;
}
} // end namespace llvm

#undef DEBUG_TYPE

#endif