BlockFrequencyInfoImpl.h revision dce4a407a24b04eebc6a376f8e62b41aaa7b071f
1//==- BlockFrequencyInfoImpl.h - Block Frequency Implementation -*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// Shared implementation of BlockFrequency for IR and Machine Instructions.
11// See the documentation below for BlockFrequencyInfoImpl for details.
12//
13//===----------------------------------------------------------------------===//
14
15#ifndef LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
16#define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
17
18#include "llvm/ADT/DenseMap.h"
19#include "llvm/ADT/PostOrderIterator.h"
20#include "llvm/ADT/iterator_range.h"
21#include "llvm/IR/BasicBlock.h"
22#include "llvm/Support/BlockFrequency.h"
23#include "llvm/Support/BranchProbability.h"
24#include "llvm/Support/Debug.h"
25#include "llvm/Support/raw_ostream.h"
26#include <deque>
27#include <list>
28#include <string>
29#include <vector>
30
31#define DEBUG_TYPE "block-freq"
32
33//===----------------------------------------------------------------------===//
34//
35// UnsignedFloat definition.
36//
37// TODO: Make this private to BlockFrequencyInfoImpl or delete.
38//
39//===----------------------------------------------------------------------===//
40namespace llvm {
41
42class UnsignedFloatBase {
43public:
44  static const int32_t MaxExponent = 16383;
45  static const int32_t MinExponent = -16382;
46  static const int DefaultPrecision = 10;
47
48  static void dump(uint64_t D, int16_t E, int Width);
49  static raw_ostream &print(raw_ostream &OS, uint64_t D, int16_t E, int Width,
50                            unsigned Precision);
51  static std::string toString(uint64_t D, int16_t E, int Width,
52                              unsigned Precision);
53  static int countLeadingZeros32(uint32_t N) { return countLeadingZeros(N); }
54  static int countLeadingZeros64(uint64_t N) { return countLeadingZeros(N); }
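  /// \brief Half of \c N, rounding up.  Used as the threshold when rounding
  /// quotients to nearest.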
55  static uint64_t getHalf(uint64_t N) { return (N >> 1) + (N & 1); }
56
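  /// \brief Split \c N into its magnitude and a "negative" flag.
  ///
  /// \c INT64_MIN is special-cased, since its magnitude is not representable
  /// as a positive \c int64_t.  \a joinSigned() is the inverse, saturating at
  /// \c INT64_MAX / \c INT64_MIN when the magnitude does not fit.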
57  static std::pair<uint64_t, bool> splitSigned(int64_t N) {
58    if (N >= 0)
59      return std::make_pair(N, false);
60    uint64_t Unsigned = N == INT64_MIN ? UINT64_C(1) << 63 : uint64_t(-N);
61    return std::make_pair(Unsigned, true);
62  }
63  static int64_t joinSigned(uint64_t U, bool IsNeg) {
64    if (U > uint64_t(INT64_MAX))
65      return IsNeg ? INT64_MIN : INT64_MAX;
66    return IsNeg ? -int64_t(U) : int64_t(U);
67  }
68
69  static int32_t extractLg(const std::pair<int32_t, int> &Lg) {
70    return Lg.first;
71  }
72  static int32_t extractLgFloor(const std::pair<int32_t, int> &Lg) {
73    return Lg.first - (Lg.second > 0);
74  }
75  static int32_t extractLgCeiling(const std::pair<int32_t, int> &Lg) {
76    return Lg.first + (Lg.second < 0);
77  }
78
79  static std::pair<uint64_t, int16_t> divide64(uint64_t L, uint64_t R);
80  static std::pair<uint64_t, int16_t> multiply64(uint64_t L, uint64_t R);
81
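  /// \brief Compare \c L to \c R shifted left by \c Shift, without overflow.
  ///
  /// Returns -1, 0, or 1 for less-than, equal, or greater-than.  \c L is
  /// shifted down rather than shifting \c R up; the final check accounts for
  /// any low bits of \c L discarded by the shift.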
82  static int compare(uint64_t L, uint64_t R, int Shift) {
83    assert(Shift >= 0);
84    assert(Shift < 64);
85
86    uint64_t L_adjusted = L >> Shift;
87    if (L_adjusted < R)
88      return -1;
89    if (L_adjusted > R)
90      return 1;
91
92    return L > L_adjusted << Shift ? 1 : 0;
93  }
94};
95
96/// \brief Simple representation of an unsigned floating point.
97///
/// UnsignedFloat is an unsigned floating point number.  It uses simple
99/// saturation arithmetic, and every operation is well-defined for every value.
100///
101/// The number is split into a signed exponent and unsigned digits.  The number
102/// represented is \c getDigits()*2^getExponent().  In this way, the digits are
103/// much like the mantissa in the x87 long double, but there is no canonical
104/// form, so the same number can be represented by many bit representations
105/// (it's always in "denormal" mode).
106///
107/// UnsignedFloat is templated on the underlying integer type for digits, which
108/// is expected to be one of uint64_t, uint32_t, uint16_t or uint8_t.
109///
110/// Unlike builtin floating point types, UnsignedFloat is portable.
111///
112/// Unlike APFloat, UnsignedFloat does not model architecture floating point
113/// behaviour (this should make it a little faster), and implements most
114/// operators (this makes it usable).
115///
116/// UnsignedFloat is totally ordered.  However, there is no canonical form, so
117/// there are multiple representations of most scalars.  E.g.:
118///
119///     UnsignedFloat(8u, 0) == UnsignedFloat(4u, 1)
120///     UnsignedFloat(4u, 1) == UnsignedFloat(2u, 2)
121///     UnsignedFloat(2u, 2) == UnsignedFloat(1u, 3)
122///
123/// UnsignedFloat implements most arithmetic operations.  Precision is kept
/// where possible.  It uses simple saturation arithmetic: operations saturate
/// to 0.0 or getLargest() rather than underflow or overflow.  It has
126/// some extra arithmetic for unit inversion.  0.0/0.0 is defined to be 0.0.
127/// Any other division by 0.0 is defined to be getLargest().
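///
/// E.g. (an informal illustration of the saturation rules above):
///
///     getLargest() + getOne() == getLargest()  // saturates
///     getZero() - getOne()    == getZero()     // saturates
///     getOne() / getZero()    == getLargest()
///     getZero() / getZero()   == getZero()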
128///
129/// As a convenience for modifying the exponent, left and right shifting are
130/// both implemented, and both interpret negative shifts as positive shifts in
131/// the opposite direction.
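///
/// E.g., given some UnsignedFloat \c X (illustrative):
///
///     X >>= 2;   // divide by 4
///     X <<= -2;  // also divide by 4; a negative shift reverses direction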
132///
133/// Exponents are limited to the range accepted by x87 long double.  This makes
134/// it trivial to add functionality to convert to APFloat (this is already
135/// relied on for the implementation of printing).
136///
137/// The current plan is to gut this and make the necessary parts of it (even
138/// more) private to BlockFrequencyInfo.
139template <class DigitsT> class UnsignedFloat : UnsignedFloatBase {
140public:
141  static_assert(!std::numeric_limits<DigitsT>::is_signed,
142                "only unsigned floats supported");
143
144  typedef DigitsT DigitsType;
145
146private:
147  typedef std::numeric_limits<DigitsType> DigitsLimits;
148
149  static const int Width = sizeof(DigitsType) * 8;
150  static_assert(Width <= 64, "invalid integer width for digits");
151
152private:
153  DigitsType Digits;
154  int16_t Exponent;
155
156public:
157  UnsignedFloat() : Digits(0), Exponent(0) {}
158
159  UnsignedFloat(DigitsType Digits, int16_t Exponent)
160      : Digits(Digits), Exponent(Exponent) {}
161
162private:
163  UnsignedFloat(const std::pair<uint64_t, int16_t> &X)
164      : Digits(X.first), Exponent(X.second) {}
165
166public:
167  static UnsignedFloat getZero() { return UnsignedFloat(0, 0); }
168  static UnsignedFloat getOne() { return UnsignedFloat(1, 0); }
169  static UnsignedFloat getLargest() {
170    return UnsignedFloat(DigitsLimits::max(), MaxExponent);
171  }
172  static UnsignedFloat getFloat(uint64_t N) { return adjustToWidth(N, 0); }
173  static UnsignedFloat getInverseFloat(uint64_t N) {
174    return getFloat(N).invert();
175  }
176  static UnsignedFloat getFraction(DigitsType N, DigitsType D) {
177    return getQuotient(N, D);
178  }
179
180  int16_t getExponent() const { return Exponent; }
181  DigitsType getDigits() const { return Digits; }
182
183  /// \brief Convert to the given integer type.
184  ///
185  /// Convert to \c IntT using simple saturating arithmetic, truncating if
186  /// necessary.
187  template <class IntT> IntT toInt() const;
188
189  bool isZero() const { return !Digits; }
190  bool isLargest() const { return *this == getLargest(); }
191  bool isOne() const {
192    if (Exponent > 0 || Exponent <= -Width)
193      return false;
194    return Digits == DigitsType(1) << -Exponent;
195  }
196
197  /// \brief The log base 2, rounded.
198  ///
199  /// Get the lg of the scalar.  lg 0 is defined to be INT32_MIN.
200  int32_t lg() const { return extractLg(lgImpl()); }
201
202  /// \brief The log base 2, rounded towards INT32_MIN.
203  ///
204  /// Get the lg floor.  lg 0 is defined to be INT32_MIN.
205  int32_t lgFloor() const { return extractLgFloor(lgImpl()); }
206
207  /// \brief The log base 2, rounded towards INT32_MAX.
208  ///
209  /// Get the lg ceiling.  lg 0 is defined to be INT32_MIN.
210  int32_t lgCeiling() const { return extractLgCeiling(lgImpl()); }
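
  // E.g. (illustrative): for a value of 12 (between 2^3 and 2^4), lg()
  // returns 4 (rounded to nearest), lgFloor() returns 3, and lgCeiling()
  // returns 4.  For an exact power of two such as 8, all three return 3.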
211
212  bool operator==(const UnsignedFloat &X) const { return compare(X) == 0; }
213  bool operator<(const UnsignedFloat &X) const { return compare(X) < 0; }
214  bool operator!=(const UnsignedFloat &X) const { return compare(X) != 0; }
215  bool operator>(const UnsignedFloat &X) const { return compare(X) > 0; }
216  bool operator<=(const UnsignedFloat &X) const { return compare(X) <= 0; }
217  bool operator>=(const UnsignedFloat &X) const { return compare(X) >= 0; }
218
219  bool operator!() const { return isZero(); }
220
221  /// \brief Convert to a decimal representation in a string.
222  ///
223  /// Convert to a string.  Uses scientific notation for very large/small
224  /// numbers.  Scientific notation is used roughly for numbers outside of the
225  /// range 2^-64 through 2^64.
226  ///
227  /// \c Precision indicates the number of decimal digits of precision to use;
228  /// 0 requests the maximum available.
229  ///
230  /// As a special case to make debugging easier, if the number is small enough
231  /// to convert without scientific notation and has more than \c Precision
232  /// digits before the decimal place, it's printed accurately to the first
233  /// digit past zero.  E.g., assuming 10 digits of precision:
234  ///
235  ///     98765432198.7654... => 98765432198.8
236  ///      8765432198.7654... =>  8765432198.8
237  ///       765432198.7654... =>   765432198.8
238  ///        65432198.7654... =>    65432198.77
239  ///         5432198.7654... =>     5432198.765
240  std::string toString(unsigned Precision = DefaultPrecision) {
241    return UnsignedFloatBase::toString(Digits, Exponent, Width, Precision);
242  }
243
244  /// \brief Print a decimal representation.
245  ///
246  /// Print a string.  See toString for documentation.
247  raw_ostream &print(raw_ostream &OS,
248                     unsigned Precision = DefaultPrecision) const {
249    return UnsignedFloatBase::print(OS, Digits, Exponent, Width, Precision);
250  }
251  void dump() const { return UnsignedFloatBase::dump(Digits, Exponent, Width); }
252
253  UnsignedFloat &operator+=(const UnsignedFloat &X);
254  UnsignedFloat &operator-=(const UnsignedFloat &X);
255  UnsignedFloat &operator*=(const UnsignedFloat &X);
256  UnsignedFloat &operator/=(const UnsignedFloat &X);
257  UnsignedFloat &operator<<=(int16_t Shift) { shiftLeft(Shift); return *this; }
258  UnsignedFloat &operator>>=(int16_t Shift) { shiftRight(Shift); return *this; }
259
260private:
261  void shiftLeft(int32_t Shift);
262  void shiftRight(int32_t Shift);
263
264  /// \brief Adjust two floats to have matching exponents.
265  ///
266  /// Adjust \c this and \c X to have matching exponents.  Returns the new \c X
267  /// by value.  Does nothing if \a isZero() for either.
268  ///
269  /// The value that compares smaller will lose precision, and possibly become
270  /// \a isZero().
271  UnsignedFloat matchExponents(UnsignedFloat X);
272
273  /// \brief Increase exponent to match another float.
274  ///
275  /// Increases \c this to have an exponent matching \c X.  May decrease the
276  /// exponent of \c X in the process, and \c this may possibly become \a
277  /// isZero().
278  void increaseExponentToMatch(UnsignedFloat &X, int32_t ExponentDiff);
279
280public:
281  /// \brief Scale a large number accurately.
282  ///
283  /// Scale N (multiply it by this).  Uses full precision multiplication, even
284  /// if Width is smaller than 64, so information is not lost.
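  ///
  /// E.g. (illustrative): a value representing one half, such as
  /// UnsignedFloat(1, -1), scales 100 down to 50.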
285  uint64_t scale(uint64_t N) const;
286  uint64_t scaleByInverse(uint64_t N) const {
287    // TODO: implement directly, rather than relying on inverse.  Inverse is
288    // expensive.
289    return inverse().scale(N);
290  }
291  int64_t scale(int64_t N) const {
292    std::pair<uint64_t, bool> Unsigned = splitSigned(N);
293    return joinSigned(scale(Unsigned.first), Unsigned.second);
294  }
295  int64_t scaleByInverse(int64_t N) const {
296    std::pair<uint64_t, bool> Unsigned = splitSigned(N);
297    return joinSigned(scaleByInverse(Unsigned.first), Unsigned.second);
298  }
299
300  int compare(const UnsignedFloat &X) const;
301  int compareTo(uint64_t N) const {
302    UnsignedFloat Float = getFloat(N);
303    int Compare = compare(Float);
304    if (Width == 64 || Compare != 0)
305      return Compare;
306
307    // Check for precision loss.  We know *this == RoundTrip.
308    uint64_t RoundTrip = Float.template toInt<uint64_t>();
309    return N == RoundTrip ? 0 : RoundTrip < N ? -1 : 1;
310  }
311  int compareTo(int64_t N) const { return N < 0 ? 1 : compareTo(uint64_t(N)); }
312
313  UnsignedFloat &invert() { return *this = UnsignedFloat::getFloat(1) / *this; }
314  UnsignedFloat inverse() const { return UnsignedFloat(*this).invert(); }
315
316private:
317  static UnsignedFloat getProduct(DigitsType L, DigitsType R);
318  static UnsignedFloat getQuotient(DigitsType Dividend, DigitsType Divisor);
319
320  std::pair<int32_t, int> lgImpl() const;
321  static int countLeadingZerosWidth(DigitsType Digits) {
322    if (Width == 64)
323      return countLeadingZeros64(Digits);
324    if (Width == 32)
325      return countLeadingZeros32(Digits);
326    return countLeadingZeros32(Digits) + Width - 32;
327  }
328
329  static UnsignedFloat adjustToWidth(uint64_t N, int32_t S) {
330    assert(S >= MinExponent);
331    assert(S <= MaxExponent);
332    if (Width == 64 || N <= DigitsLimits::max())
333      return UnsignedFloat(N, S);
334
335    // Shift right.
336    int Shift = 64 - Width - countLeadingZeros64(N);
337    DigitsType Shifted = N >> Shift;
338
339    // Round.
340    assert(S + Shift <= MaxExponent);
341    return getRounded(UnsignedFloat(Shifted, S + Shift),
342                      N & UINT64_C(1) << (Shift - 1));
343  }
344
345  static UnsignedFloat getRounded(UnsignedFloat P, bool Round) {
346    if (!Round)
347      return P;
348    if (P.Digits == DigitsLimits::max())
349      // Careful of overflow in the exponent.
350      return UnsignedFloat(1, P.Exponent) <<= Width;
351    return UnsignedFloat(P.Digits + 1, P.Exponent);
352  }
353};
354
355#define UNSIGNED_FLOAT_BOP(op, base)                                           \
356  template <class DigitsT>                                                     \
357  UnsignedFloat<DigitsT> operator op(const UnsignedFloat<DigitsT> &L,          \
358                                     const UnsignedFloat<DigitsT> &R) {        \
359    return UnsignedFloat<DigitsT>(L) base R;                                   \
360  }
361UNSIGNED_FLOAT_BOP(+, += )
362UNSIGNED_FLOAT_BOP(-, -= )
363UNSIGNED_FLOAT_BOP(*, *= )
364UNSIGNED_FLOAT_BOP(/, /= )
365UNSIGNED_FLOAT_BOP(<<, <<= )
366UNSIGNED_FLOAT_BOP(>>, >>= )
367#undef UNSIGNED_FLOAT_BOP
368
369template <class DigitsT>
370raw_ostream &operator<<(raw_ostream &OS, const UnsignedFloat<DigitsT> &X) {
371  return X.print(OS, 10);
372}
373
374#define UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, T1, T2)                             \
375  template <class DigitsT>                                                     \
376  bool operator op(const UnsignedFloat<DigitsT> &L, T1 R) {                    \
377    return L.compareTo(T2(R)) op 0;                                            \
378  }                                                                            \
379  template <class DigitsT>                                                     \
380  bool operator op(T1 L, const UnsignedFloat<DigitsT> &R) {                    \
381    return 0 op R.compareTo(T2(L));                                            \
382  }
383#define UNSIGNED_FLOAT_COMPARE_TO(op)                                          \
384  UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, uint64_t, uint64_t)                       \
385  UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, uint32_t, uint64_t)                       \
386  UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, int64_t, int64_t)                         \
387  UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, int32_t, int64_t)
388UNSIGNED_FLOAT_COMPARE_TO(< )
389UNSIGNED_FLOAT_COMPARE_TO(> )
390UNSIGNED_FLOAT_COMPARE_TO(== )
391UNSIGNED_FLOAT_COMPARE_TO(!= )
392UNSIGNED_FLOAT_COMPARE_TO(<= )
393UNSIGNED_FLOAT_COMPARE_TO(>= )
394#undef UNSIGNED_FLOAT_COMPARE_TO
395#undef UNSIGNED_FLOAT_COMPARE_TO_TYPE
396
397template <class DigitsT>
398uint64_t UnsignedFloat<DigitsT>::scale(uint64_t N) const {
399  if (Width == 64 || N <= DigitsLimits::max())
400    return (getFloat(N) * *this).template toInt<uint64_t>();
401
402  // Defer to the 64-bit version.
403  return UnsignedFloat<uint64_t>(Digits, Exponent).scale(N);
404}
405
406template <class DigitsT>
407UnsignedFloat<DigitsT> UnsignedFloat<DigitsT>::getProduct(DigitsType L,
408                                                          DigitsType R) {
409  // Check for zero.
410  if (!L || !R)
411    return getZero();
412
413  // Check for numbers that we can compute with 64-bit math.
414  if (Width <= 32 || (L <= UINT32_MAX && R <= UINT32_MAX))
415    return adjustToWidth(uint64_t(L) * uint64_t(R), 0);
416
417  // Do the full thing.
418  return UnsignedFloat(multiply64(L, R));
419}
420template <class DigitsT>
421UnsignedFloat<DigitsT> UnsignedFloat<DigitsT>::getQuotient(DigitsType Dividend,
422                                                           DigitsType Divisor) {
423  // Check for zero.
424  if (!Dividend)
425    return getZero();
426  if (!Divisor)
427    return getLargest();
428
429  if (Width == 64)
430    return UnsignedFloat(divide64(Dividend, Divisor));
431
432  // We can compute this with 64-bit math.
433  int Shift = countLeadingZeros64(Dividend);
434  uint64_t Shifted = uint64_t(Dividend) << Shift;
435  uint64_t Quotient = Shifted / Divisor;
436
437  // If Quotient needs to be shifted, then adjustToWidth will round.
438  if (Quotient > DigitsLimits::max())
439    return adjustToWidth(Quotient, -Shift);
440
441  // Round based on the value of the next bit.
442  return getRounded(UnsignedFloat(Quotient, -Shift),
443                    Shifted % Divisor >= getHalf(Divisor));
444}
445
446template <class DigitsT>
447template <class IntT>
448IntT UnsignedFloat<DigitsT>::toInt() const {
449  typedef std::numeric_limits<IntT> Limits;
450  if (*this < 1)
451    return 0;
452  if (*this >= Limits::max())
453    return Limits::max();
454
455  IntT N = Digits;
456  if (Exponent > 0) {
457    assert(size_t(Exponent) < sizeof(IntT) * 8);
458    return N << Exponent;
459  }
460  if (Exponent < 0) {
461    assert(size_t(-Exponent) < sizeof(IntT) * 8);
462    return N >> -Exponent;
463  }
464  return N;
465}
466
467template <class DigitsT>
468std::pair<int32_t, int> UnsignedFloat<DigitsT>::lgImpl() const {
469  if (isZero())
470    return std::make_pair(INT32_MIN, 0);
471
472  // Get the floor of the lg of Digits.
473  int32_t LocalFloor = Width - countLeadingZerosWidth(Digits) - 1;
474
475  // Get the floor of the lg of this.
476  int32_t Floor = Exponent + LocalFloor;
477  if (Digits == UINT64_C(1) << LocalFloor)
478    return std::make_pair(Floor, 0);
479
480  // Round based on the next digit.
481  assert(LocalFloor >= 1);
482  bool Round = Digits & UINT64_C(1) << (LocalFloor - 1);
483  return std::make_pair(Floor + Round, Round ? 1 : -1);
484}
485
486template <class DigitsT>
487UnsignedFloat<DigitsT> UnsignedFloat<DigitsT>::matchExponents(UnsignedFloat X) {
488  if (isZero() || X.isZero() || Exponent == X.Exponent)
489    return X;
490
491  int32_t Diff = int32_t(X.Exponent) - int32_t(Exponent);
492  if (Diff > 0)
493    increaseExponentToMatch(X, Diff);
494  else
495    X.increaseExponentToMatch(*this, -Diff);
496  return X;
497}
498template <class DigitsT>
499void UnsignedFloat<DigitsT>::increaseExponentToMatch(UnsignedFloat &X,
500                                                     int32_t ExponentDiff) {
501  assert(ExponentDiff > 0);
502  if (ExponentDiff >= 2 * Width) {
503    *this = getZero();
504    return;
505  }
506
507  // Use up any leading zeros on X, and then shift this.
508  int32_t ShiftX = std::min(countLeadingZerosWidth(X.Digits), ExponentDiff);
509  assert(ShiftX < Width);
510
511  int32_t ShiftThis = ExponentDiff - ShiftX;
512  if (ShiftThis >= Width) {
513    *this = getZero();
514    return;
515  }
516
517  X.Digits <<= ShiftX;
518  X.Exponent -= ShiftX;
519  Digits >>= ShiftThis;
520  Exponent += ShiftThis;
521  return;
522}
523
524template <class DigitsT>
525UnsignedFloat<DigitsT> &UnsignedFloat<DigitsT>::
526operator+=(const UnsignedFloat &X) {
527  if (isLargest() || X.isZero())
528    return *this;
529  if (isZero() || X.isLargest())
530    return *this = X;
531
532  // Normalize exponents.
533  UnsignedFloat Scaled = matchExponents(X);
534
535  // Check for zero again.
536  if (isZero())
537    return *this = Scaled;
538  if (Scaled.isZero())
539    return *this;
540
541  // Compute sum.
542  DigitsType Sum = Digits + Scaled.Digits;
543  bool DidOverflow = Sum < Digits;
544  Digits = Sum;
545  if (!DidOverflow)
546    return *this;
547
548  if (Exponent == MaxExponent)
549    return *this = getLargest();
550
551  ++Exponent;
552  Digits = UINT64_C(1) << (Width - 1) | Digits >> 1;
553
554  return *this;
555}
556template <class DigitsT>
557UnsignedFloat<DigitsT> &UnsignedFloat<DigitsT>::
558operator-=(const UnsignedFloat &X) {
559  if (X.isZero())
560    return *this;
561  if (*this <= X)
562    return *this = getZero();
563
564  // Normalize exponents.
565  UnsignedFloat Scaled = matchExponents(X);
566  assert(Digits >= Scaled.Digits);
567
568  // Compute difference.
569  if (!Scaled.isZero()) {
570    Digits -= Scaled.Digits;
571    return *this;
572  }
573
574  // Check if X just barely lost its last bit.  E.g., for 32-bit:
575  //
576  //   1*2^32 - 1*2^0 == 0xffffffff != 1*2^32
577  if (*this == UnsignedFloat(1, X.lgFloor() + Width)) {
578    Digits = DigitsType(0) - 1;
579    --Exponent;
580  }
581  return *this;
582}
583template <class DigitsT>
584UnsignedFloat<DigitsT> &UnsignedFloat<DigitsT>::
585operator*=(const UnsignedFloat &X) {
586  if (isZero())
587    return *this;
588  if (X.isZero())
589    return *this = X;
590
591  // Save the exponents.
592  int32_t Exponents = int32_t(Exponent) + int32_t(X.Exponent);
593
594  // Get the raw product.
595  *this = getProduct(Digits, X.Digits);
596
597  // Combine with exponents.
598  return *this <<= Exponents;
599}
600template <class DigitsT>
601UnsignedFloat<DigitsT> &UnsignedFloat<DigitsT>::
602operator/=(const UnsignedFloat &X) {
603  if (isZero())
604    return *this;
605  if (X.isZero())
606    return *this = getLargest();
607
608  // Save the exponents.
609  int32_t Exponents = int32_t(Exponent) - int32_t(X.Exponent);
610
611  // Get the raw quotient.
612  *this = getQuotient(Digits, X.Digits);
613
614  // Combine with exponents.
615  return *this <<= Exponents;
616}
617template <class DigitsT>
618void UnsignedFloat<DigitsT>::shiftLeft(int32_t Shift) {
619  if (!Shift || isZero())
620    return;
621  assert(Shift != INT32_MIN);
622  if (Shift < 0) {
623    shiftRight(-Shift);
624    return;
625  }
626
627  // Shift as much as we can in the exponent.
628  int32_t ExponentShift = std::min(Shift, MaxExponent - Exponent);
629  Exponent += ExponentShift;
630  if (ExponentShift == Shift)
631    return;
632
633  // Check this late, since it's rare.
634  if (isLargest())
635    return;
636
637  // Shift the digits themselves.
638  Shift -= ExponentShift;
639  if (Shift > countLeadingZerosWidth(Digits)) {
640    // Saturate.
641    *this = getLargest();
642    return;
643  }
644
645  Digits <<= Shift;
646  return;
647}
648
649template <class DigitsT>
650void UnsignedFloat<DigitsT>::shiftRight(int32_t Shift) {
651  if (!Shift || isZero())
652    return;
653  assert(Shift != INT32_MIN);
654  if (Shift < 0) {
655    shiftLeft(-Shift);
656    return;
657  }
658
659  // Shift as much as we can in the exponent.
660  int32_t ExponentShift = std::min(Shift, Exponent - MinExponent);
661  Exponent -= ExponentShift;
662  if (ExponentShift == Shift)
663    return;
664
665  // Shift the digits themselves.
666  Shift -= ExponentShift;
667  if (Shift >= Width) {
668    // Saturate.
669    *this = getZero();
670    return;
671  }
672
673  Digits >>= Shift;
674  return;
675}
676
677template <class DigitsT>
678int UnsignedFloat<DigitsT>::compare(const UnsignedFloat &X) const {
679  // Check for zero.
680  if (isZero())
681    return X.isZero() ? 0 : -1;
682  if (X.isZero())
683    return 1;
684
685  // Check for the scale.  Use lgFloor to be sure that the exponent difference
686  // is always lower than 64.
687  int32_t lgL = lgFloor(), lgR = X.lgFloor();
688  if (lgL != lgR)
689    return lgL < lgR ? -1 : 1;
690
691  // Compare digits.
692  if (Exponent < X.Exponent)
693    return UnsignedFloatBase::compare(Digits, X.Digits, X.Exponent - Exponent);
694
695  return -UnsignedFloatBase::compare(X.Digits, Digits, Exponent - X.Exponent);
696}
697
698template <class T> struct isPodLike<UnsignedFloat<T>> {
699  static const bool value = true;
700};
701}
702
703//===----------------------------------------------------------------------===//
704//
705// BlockMass definition.
706//
707// TODO: Make this private to BlockFrequencyInfoImpl or delete.
708//
709//===----------------------------------------------------------------------===//
710namespace llvm {
711
712/// \brief Mass of a block.
713///
714/// This class implements a sort of fixed-point fraction always between 0.0 and
715/// 1.0.  getMass() == UINT64_MAX indicates a value of 1.0.
716///
717/// Masses can be added and subtracted.  Simple saturation arithmetic is used,
718/// so arithmetic operations never overflow or underflow.
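///
/// E.g. (illustrative):
///
///     BlockMass::getFull() + BlockMass(1)    // saturates at isFull()
///     BlockMass::getEmpty() - BlockMass(1)   // saturates at isEmpty()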
719///
720/// Masses can be multiplied.  Multiplication treats full mass as 1.0 and uses
721/// an inexpensive floating-point algorithm that's off-by-one (almost, but not
722/// quite, maximum precision).
723///
724/// Masses can be scaled by \a BranchProbability at maximum precision.
725class BlockMass {
726  uint64_t Mass;
727
728public:
729  BlockMass() : Mass(0) {}
730  explicit BlockMass(uint64_t Mass) : Mass(Mass) {}
731
732  static BlockMass getEmpty() { return BlockMass(); }
733  static BlockMass getFull() { return BlockMass(UINT64_MAX); }
734
735  uint64_t getMass() const { return Mass; }
736
737  bool isFull() const { return Mass == UINT64_MAX; }
738  bool isEmpty() const { return !Mass; }
739
740  bool operator!() const { return isEmpty(); }
741
742  /// \brief Add another mass.
743  ///
744  /// Adds another mass, saturating at \a isFull() rather than overflowing.
745  BlockMass &operator+=(const BlockMass &X) {
746    uint64_t Sum = Mass + X.Mass;
747    Mass = Sum < Mass ? UINT64_MAX : Sum;
748    return *this;
749  }
750
751  /// \brief Subtract another mass.
752  ///
753  /// Subtracts another mass, saturating at \a isEmpty() rather than
/// underflowing.
755  BlockMass &operator-=(const BlockMass &X) {
756    uint64_t Diff = Mass - X.Mass;
757    Mass = Diff > Mass ? 0 : Diff;
758    return *this;
759  }
760
761  BlockMass &operator*=(const BranchProbability &P) {
762    Mass = P.scale(Mass);
763    return *this;
764  }
765
766  bool operator==(const BlockMass &X) const { return Mass == X.Mass; }
767  bool operator!=(const BlockMass &X) const { return Mass != X.Mass; }
768  bool operator<=(const BlockMass &X) const { return Mass <= X.Mass; }
769  bool operator>=(const BlockMass &X) const { return Mass >= X.Mass; }
770  bool operator<(const BlockMass &X) const { return Mass < X.Mass; }
771  bool operator>(const BlockMass &X) const { return Mass > X.Mass; }
772
773  /// \brief Convert to floating point.
774  ///
775  /// Convert to a float.  \a isFull() gives 1.0, while \a isEmpty() gives
776  /// slightly above 0.0.
777  UnsignedFloat<uint64_t> toFloat() const;
778
779  void dump() const;
780  raw_ostream &print(raw_ostream &OS) const;
781};
782
783inline BlockMass operator+(const BlockMass &L, const BlockMass &R) {
784  return BlockMass(L) += R;
785}
786inline BlockMass operator-(const BlockMass &L, const BlockMass &R) {
787  return BlockMass(L) -= R;
788}
789inline BlockMass operator*(const BlockMass &L, const BranchProbability &R) {
790  return BlockMass(L) *= R;
791}
792inline BlockMass operator*(const BranchProbability &L, const BlockMass &R) {
793  return BlockMass(R) *= L;
794}
795
796inline raw_ostream &operator<<(raw_ostream &OS, const BlockMass &X) {
797  return X.print(OS);
798}
799
800template <> struct isPodLike<BlockMass> {
801  static const bool value = true;
802};
803}
804
805//===----------------------------------------------------------------------===//
806//
807// BlockFrequencyInfoImpl definition.
808//
809//===----------------------------------------------------------------------===//
810namespace llvm {
811
812class BasicBlock;
813class BranchProbabilityInfo;
814class Function;
815class Loop;
816class LoopInfo;
817class MachineBasicBlock;
818class MachineBranchProbabilityInfo;
819class MachineFunction;
820class MachineLoop;
821class MachineLoopInfo;
822
823namespace bfi_detail {
824struct IrreducibleGraph;
825
826// This is part of a workaround for a GCC 4.7 crash on lambdas.
827template <class BT> struct BlockEdgesAdder;
828}
829
830/// \brief Base class for BlockFrequencyInfoImpl
831///
832/// BlockFrequencyInfoImplBase has supporting data structures and some
/// algorithms for BlockFrequencyInfoImpl.  Only algorithms that depend on
834/// the block type (or that call such algorithms) are skipped here.
835///
/// Nevertheless, the majority of the overall algorithm documentation lives
/// with BlockFrequencyInfoImpl.  See there for details.
838class BlockFrequencyInfoImplBase {
839public:
840  typedef UnsignedFloat<uint64_t> Float;
841
842  /// \brief Representative of a block.
843  ///
844  /// This is a simple wrapper around an index into the reverse-post-order
845  /// traversal of the blocks.
846  ///
847  /// Unlike a block pointer, its order has meaning (location in the
  /// topological sort) and its class is the same regardless of block type.
849  struct BlockNode {
850    typedef uint32_t IndexType;
851    IndexType Index;
852
853    bool operator==(const BlockNode &X) const { return Index == X.Index; }
854    bool operator!=(const BlockNode &X) const { return Index != X.Index; }
855    bool operator<=(const BlockNode &X) const { return Index <= X.Index; }
856    bool operator>=(const BlockNode &X) const { return Index >= X.Index; }
857    bool operator<(const BlockNode &X) const { return Index < X.Index; }
858    bool operator>(const BlockNode &X) const { return Index > X.Index; }
859
860    BlockNode() : Index(UINT32_MAX) {}
861    BlockNode(IndexType Index) : Index(Index) {}
862
863    bool isValid() const { return Index <= getMaxIndex(); }
864    static size_t getMaxIndex() { return UINT32_MAX - 1; }
865  };
866
867  /// \brief Stats about a block itself.
868  struct FrequencyData {
869    Float Floating;
870    uint64_t Integer;
871  };
872
873  /// \brief Data about a loop.
874  ///
  /// Contains the data necessary to represent a loop as a
876  /// pseudo-node once it's packaged.
877  struct LoopData {
878    typedef SmallVector<std::pair<BlockNode, BlockMass>, 4> ExitMap;
879    typedef SmallVector<BlockNode, 4> NodeList;
880    LoopData *Parent;       ///< The parent loop.
881    bool IsPackaged;        ///< Whether this has been packaged.
882    uint32_t NumHeaders;    ///< Number of headers.
883    ExitMap Exits;          ///< Successor edges (and weights).
884    NodeList Nodes;         ///< Header and the members of the loop.
885    BlockMass BackedgeMass; ///< Mass returned to loop header.
886    BlockMass Mass;
887    Float Scale;
888
889    LoopData(LoopData *Parent, const BlockNode &Header)
890        : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header) {}
891    template <class It1, class It2>
892    LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther,
893             It2 LastOther)
894        : Parent(Parent), IsPackaged(false), Nodes(FirstHeader, LastHeader) {
895      NumHeaders = Nodes.size();
896      Nodes.insert(Nodes.end(), FirstOther, LastOther);
897    }
898    bool isHeader(const BlockNode &Node) const {
899      if (isIrreducible())
900        return std::binary_search(Nodes.begin(), Nodes.begin() + NumHeaders,
901                                  Node);
902      return Node == Nodes[0];
903    }
904    BlockNode getHeader() const { return Nodes[0]; }
905    bool isIrreducible() const { return NumHeaders > 1; }
906
907    NodeList::const_iterator members_begin() const {
908      return Nodes.begin() + NumHeaders;
909    }
910    NodeList::const_iterator members_end() const { return Nodes.end(); }
911    iterator_range<NodeList::const_iterator> members() const {
912      return make_range(members_begin(), members_end());
913    }
914  };
915
916  /// \brief Index of loop information.
917  struct WorkingData {
918    BlockNode Node; ///< This node.
919    LoopData *Loop; ///< The loop this block is inside.
920    BlockMass Mass; ///< Mass distribution from the entry block.
921
922    WorkingData(const BlockNode &Node) : Node(Node), Loop(nullptr) {}
923
924    bool isLoopHeader() const { return Loop && Loop->isHeader(Node); }
925    bool isDoubleLoopHeader() const {
926      return isLoopHeader() && Loop->Parent && Loop->Parent->isIrreducible() &&
927             Loop->Parent->isHeader(Node);
928    }
929
930    LoopData *getContainingLoop() const {
931      if (!isLoopHeader())
932        return Loop;
933      if (!isDoubleLoopHeader())
934        return Loop->Parent;
935      return Loop->Parent->Parent;
936    }
937
938    /// \brief Resolve a node to its representative.
939    ///
940    /// Get the node currently representing Node, which could be a containing
941    /// loop.
942    ///
943    /// This function should only be called when distributing mass.  As long as
    /// there are no irreducible edges to Node, then it will have complexity
945    /// O(1) in this context.
946    ///
947    /// In general, the complexity is O(L), where L is the number of loop
948    /// headers Node has been packaged into.  Since this method is called in
949    /// the context of distributing mass, L will be the number of loop headers
950    /// an early exit edge jumps out of.
951    BlockNode getResolvedNode() const {
952      auto L = getPackagedLoop();
953      return L ? L->getHeader() : Node;
954    }
955    LoopData *getPackagedLoop() const {
956      if (!Loop || !Loop->IsPackaged)
957        return nullptr;
958      auto L = Loop;
959      while (L->Parent && L->Parent->IsPackaged)
960        L = L->Parent;
961      return L;
962    }
963
964    /// \brief Get the appropriate mass for a node.
965    ///
966    /// Get appropriate mass for Node.  If Node is a loop-header (whose loop
967    /// has been packaged), returns the mass of its pseudo-node.  If it's a
968    /// node inside a packaged loop, it returns the loop's mass.
969    BlockMass &getMass() {
970      if (!isAPackage())
971        return Mass;
972      if (!isADoublePackage())
973        return Loop->Mass;
974      return Loop->Parent->Mass;
975    }
976
977    /// \brief Has ContainingLoop been packaged up?
978    bool isPackaged() const { return getResolvedNode() != Node; }
979    /// \brief Has Loop been packaged up?
980    bool isAPackage() const { return isLoopHeader() && Loop->IsPackaged; }
981    /// \brief Has Loop been packaged up twice?
982    bool isADoublePackage() const {
983      return isDoubleLoopHeader() && Loop->Parent->IsPackaged;
984    }
985  };
986
987  /// \brief Unscaled probability weight.
988  ///
989  /// Probability weight for an edge in the graph (including the
990  /// successor/target node).
991  ///
992  /// All edges in the original function are 32-bit.  However, exit edges from
993  /// loop packages are taken from 64-bit exit masses, so we need 64-bits of
994  /// space in general.
995  ///
996  /// In addition to the raw weight amount, Weight stores the type of the edge
997  /// in the current context (i.e., the context of the loop being processed).
998  /// Is this a local edge within the loop, an exit from the loop, or a
999  /// backedge to the loop header?
1000  struct Weight {
1001    enum DistType { Local, Exit, Backedge };
1002    DistType Type;
1003    BlockNode TargetNode;
1004    uint64_t Amount;
1005    Weight() : Type(Local), Amount(0) {}
1006  };
1007
1008  /// \brief Distribution of unscaled probability weight.
1009  ///
1010  /// Distribution of unscaled probability weight to a set of successors.
1011  ///
1012  /// This class collates the successor edge weights for later processing.
1013  ///
1014  /// \a DidOverflow indicates whether \a Total did overflow while adding to
1015  /// the distribution.  It should never overflow twice.
1016  struct Distribution {
1017    typedef SmallVector<Weight, 4> WeightList;
1018    WeightList Weights;    ///< Individual successor weights.
1019    uint64_t Total;        ///< Sum of all weights.
1020    bool DidOverflow;      ///< Whether \a Total did overflow.
1021
1022    Distribution() : Total(0), DidOverflow(false) {}
1023    void addLocal(const BlockNode &Node, uint64_t Amount) {
1024      add(Node, Amount, Weight::Local);
1025    }
1026    void addExit(const BlockNode &Node, uint64_t Amount) {
1027      add(Node, Amount, Weight::Exit);
1028    }
1029    void addBackedge(const BlockNode &Node, uint64_t Amount) {
1030      add(Node, Amount, Weight::Backedge);
1031    }
1032
1033    /// \brief Normalize the distribution.
1034    ///
1035    /// Combines multiple edges to the same \a Weight::TargetNode and scales
1036    /// down so that \a Total fits into 32-bits.
1037    ///
1038    /// This is linear in the size of \a Weights.  For the vast majority of
1039    /// cases, adjacent edge weights are combined by sorting WeightList and
1040    /// combining adjacent weights.  However, for very large edge lists an
1041    /// auxiliary hash table is used.
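    ///
    /// A hypothetical sketch of the intended behaviour (\c Succ is some
    /// arbitrary \a BlockNode):
    ///
    ///     Distribution Dist;
    ///     Dist.addLocal(Succ, 3);
    ///     Dist.addLocal(Succ, 5);  // a second edge to the same node
    ///     Dist.normalize();        // now a single local Weight for Succ
    ///                              // with Amount == 8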
1042    void normalize();
1043
1044  private:
1045    void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type);
1046  };
1047
1048  /// \brief Data about each block.  This is used downstream.
1049  std::vector<FrequencyData> Freqs;
1050
1051  /// \brief Loop data: see initializeLoops().
1052  std::vector<WorkingData> Working;
1053
1054  /// \brief Indexed information about loops.
1055  std::list<LoopData> Loops;
1056
1057  /// \brief Add all edges out of a packaged loop to the distribution.
1058  ///
  /// Adds all edges from \c Loop to \c Dist.  Calls addToDist() to add each
1060  /// successor edge.
1061  ///
1062  /// \return \c true unless there's an irreducible backedge.
1063  bool addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop,
1064                               Distribution &Dist);
1065
1066  /// \brief Add an edge to the distribution.
1067  ///
  /// Adds an edge from \c Pred to \c Succ to \c Dist.  If \c OuterLoop is
  /// non-null, whether the edge is a local/exit/backedge is determined in the
  /// context of \c OuterLoop.  Otherwise, every edge should be a local edge
  /// (since all the loops are packaged up).
1071  ///
1072  /// \return \c true unless aborted due to an irreducible backedge.
1073  bool addToDist(Distribution &Dist, const LoopData *OuterLoop,
1074                 const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight);
1075
1076  LoopData &getLoopPackage(const BlockNode &Head) {
1077    assert(Head.Index < Working.size());
1078    assert(Working[Head.Index].isLoopHeader());
1079    return *Working[Head.Index].Loop;
1080  }
1081
1082  /// \brief Analyze irreducible SCCs.
1083  ///
  /// Separate irreducible SCCs from \c G, which is an explicit graph of \c
1085  /// OuterLoop (or the top-level function, if \c OuterLoop is \c nullptr).
1086  /// Insert them into \a Loops before \c Insert.
1087  ///
1088  /// \return the \c LoopData nodes representing the irreducible SCCs.
1089  iterator_range<std::list<LoopData>::iterator>
1090  analyzeIrreducible(const bfi_detail::IrreducibleGraph &G, LoopData *OuterLoop,
1091                     std::list<LoopData>::iterator Insert);
1092
1093  /// \brief Update a loop after packaging irreducible SCCs inside of it.
1094  ///
1095  /// Update \c OuterLoop.  Before finding irreducible control flow, it was
1096  /// partway through \a computeMassInLoop(), so \a LoopData::Exits and \a
1097  /// LoopData::BackedgeMass need to be reset.  Also, nodes that were packaged
1098  /// up need to be removed from \a OuterLoop::Nodes.
1099  void updateLoopWithIrreducible(LoopData &OuterLoop);
1100
1101  /// \brief Distribute mass according to a distribution.
1102  ///
  /// Distributes the mass in \c Source according to \c Dist.  If \c OuterLoop
  /// is non-null, backedges and exits are stored in \c OuterLoop.
1105  ///
1106  /// Mass is distributed in parallel from two copies of the source mass.
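  ///
  /// E.g. (informal): if \c Source holds full mass and \c Dist has normalized
  /// weights of 1 and 3 to two successors, the successors receive roughly 1/4
  /// and 3/4 of the mass; dithering keeps the total distributed mass from
  /// drifting due to rounding.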
1107  void distributeMass(const BlockNode &Source, LoopData *OuterLoop,
1108                      Distribution &Dist);
1109
1110  /// \brief Compute the loop scale for a loop.
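  ///
  /// Informally (assuming the scale is the inverse of the mass that exits the
  /// loop): if 3/4 of the header's mass returns along backedges, then 1/4
  /// exits, and the loop scale is 4 (roughly four expected iterations).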
1111  void computeLoopScale(LoopData &Loop);
1112
1113  /// \brief Package up a loop.
1114  void packageLoop(LoopData &Loop);
1115
1116  /// \brief Unwrap loops.
1117  void unwrapLoops();
1118
1119  /// \brief Finalize frequency metrics.
1120  ///
1121  /// Calculates final frequencies and cleans up no-longer-needed data
1122  /// structures.
1123  void finalizeMetrics();
1124
1125  /// \brief Clear all memory.
1126  void clear();
1127
1128  virtual std::string getBlockName(const BlockNode &Node) const;
1129  std::string getLoopName(const LoopData &Loop) const;
1130
1131  virtual raw_ostream &print(raw_ostream &OS) const { return OS; }
1132  void dump() const { print(dbgs()); }
1133
1134  Float getFloatingBlockFreq(const BlockNode &Node) const;
1135
1136  BlockFrequency getBlockFreq(const BlockNode &Node) const;
1137
1138  raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const;
1139  raw_ostream &printBlockFreq(raw_ostream &OS,
1140                              const BlockFrequency &Freq) const;
1141
1142  uint64_t getEntryFreq() const {
1143    assert(!Freqs.empty());
1144    return Freqs[0].Integer;
1145  }
1146  /// \brief Virtual destructor.
1147  ///
1148  /// Need a virtual destructor to mask the compiler warning about
1149  /// getBlockName().
1150  virtual ~BlockFrequencyInfoImplBase() {}
1151};
1152
1153namespace bfi_detail {
1154template <class BlockT> struct TypeMap {};
1155template <> struct TypeMap<BasicBlock> {
1156  typedef BasicBlock BlockT;
1157  typedef Function FunctionT;
1158  typedef BranchProbabilityInfo BranchProbabilityInfoT;
1159  typedef Loop LoopT;
1160  typedef LoopInfo LoopInfoT;
1161};
1162template <> struct TypeMap<MachineBasicBlock> {
1163  typedef MachineBasicBlock BlockT;
1164  typedef MachineFunction FunctionT;
1165  typedef MachineBranchProbabilityInfo BranchProbabilityInfoT;
1166  typedef MachineLoop LoopT;
1167  typedef MachineLoopInfo LoopInfoT;
1168};
1169
1170/// \brief Get the name of a MachineBasicBlock.
1171///
1172/// Get the name of a MachineBasicBlock.  It's templated so that including from
1173/// CodeGen is unnecessary (that would be a layering issue).
1174///
1175/// This is used mainly for debug output.  The name is similar to
1176/// MachineBasicBlock::getFullName(), but skips the name of the function.
1177template <class BlockT> std::string getBlockName(const BlockT *BB) {
1178  assert(BB && "Unexpected nullptr");
1179  auto MachineName = "BB" + Twine(BB->getNumber());
1180  if (BB->getBasicBlock())
1181    return (MachineName + "[" + BB->getName() + "]").str();
1182  return MachineName.str();
1183}
1184/// \brief Get the name of a BasicBlock.
1185template <> inline std::string getBlockName(const BasicBlock *BB) {
1186  assert(BB && "Unexpected nullptr");
1187  return BB->getName().str();
1188}
1189
1190/// \brief Graph of irreducible control flow.
1191///
1192/// This graph is used for determining the SCCs in a loop (or top-level
1193/// function) that has irreducible control flow.
1194///
1195/// During the block frequency algorithm, the local graphs are defined in a
1196/// light-weight way, deferring to the \a BasicBlock or \a MachineBasicBlock
1197/// graphs for most edges, but getting others from \a LoopData::ExitMap.  The
1198/// latter only has successor information.
1199///
1200/// \a IrreducibleGraph makes this graph explicit.  It's in a form that can use
1201/// \a GraphTraits (so that \a analyzeIrreducible() can use \a scc_iterator),
1202/// and it explicitly lists predecessors and successors.  The initialization
1203/// that relies on \c MachineBasicBlock is defined in the header.
1204struct IrreducibleGraph {
1205  typedef BlockFrequencyInfoImplBase BFIBase;
1206
1207  BFIBase &BFI;
1208
1209  typedef BFIBase::BlockNode BlockNode;
1210  struct IrrNode {
1211    BlockNode Node;
1212    unsigned NumIn;
1213    std::deque<const IrrNode *> Edges;
1214    IrrNode(const BlockNode &Node) : Node(Node), NumIn(0) {}
1215
1216    typedef std::deque<const IrrNode *>::const_iterator iterator;
1217    iterator pred_begin() const { return Edges.begin(); }
1218    iterator succ_begin() const { return Edges.begin() + NumIn; }
1219    iterator pred_end() const { return succ_begin(); }
1220    iterator succ_end() const { return Edges.end(); }
1221  };
1222  BlockNode Start;
1223  const IrrNode *StartIrr;
1224  std::vector<IrrNode> Nodes;
1225  SmallDenseMap<uint32_t, IrrNode *, 4> Lookup;
1226
1227  /// \brief Construct an explicit graph containing irreducible control flow.
1228  ///
1229  /// Construct an explicit graph of the control flow in \c OuterLoop (or the
1230  /// top-level function, if \c OuterLoop is \c nullptr).  Uses \c
1231  /// addBlockEdges to add block successors that have not been packaged into
1232  /// loops.
1233  ///
1234  /// \a BlockFrequencyInfoImpl::computeIrreducibleMass() is the only expected
1235  /// user of this.
1236  template <class BlockEdgesAdder>
1237  IrreducibleGraph(BFIBase &BFI, const BFIBase::LoopData *OuterLoop,
1238                   BlockEdgesAdder addBlockEdges)
1239      : BFI(BFI), StartIrr(nullptr) {
1240    initialize(OuterLoop, addBlockEdges);
1241  }
1242
1243  template <class BlockEdgesAdder>
1244  void initialize(const BFIBase::LoopData *OuterLoop,
1245                  BlockEdgesAdder addBlockEdges);
1246  void addNodesInLoop(const BFIBase::LoopData &OuterLoop);
1247  void addNodesInFunction();
1248  void addNode(const BlockNode &Node) {
1249    Nodes.emplace_back(Node);
1250    BFI.Working[Node.Index].getMass() = BlockMass::getEmpty();
1251  }
1252  void indexNodes();
1253  template <class BlockEdgesAdder>
1254  void addEdges(const BlockNode &Node, const BFIBase::LoopData *OuterLoop,
1255                BlockEdgesAdder addBlockEdges);
1256  void addEdge(IrrNode &Irr, const BlockNode &Succ,
1257               const BFIBase::LoopData *OuterLoop);
1258};
1259template <class BlockEdgesAdder>
1260void IrreducibleGraph::initialize(const BFIBase::LoopData *OuterLoop,
1261                                  BlockEdgesAdder addBlockEdges) {
1262  if (OuterLoop) {
1263    addNodesInLoop(*OuterLoop);
1264    for (auto N : OuterLoop->Nodes)
1265      addEdges(N, OuterLoop, addBlockEdges);
1266  } else {
1267    addNodesInFunction();
1268    for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index)
1269      addEdges(Index, OuterLoop, addBlockEdges);
1270  }
1271  StartIrr = Lookup[Start.Index];
1272}
1273template <class BlockEdgesAdder>
1274void IrreducibleGraph::addEdges(const BlockNode &Node,
1275                                const BFIBase::LoopData *OuterLoop,
1276                                BlockEdgesAdder addBlockEdges) {
1277  auto L = Lookup.find(Node.Index);
1278  if (L == Lookup.end())
1279    return;
1280  IrrNode &Irr = *L->second;
1281  const auto &Working = BFI.Working[Node.Index];
1282
1283  if (Working.isAPackage())
1284    for (const auto &I : Working.Loop->Exits)
1285      addEdge(Irr, I.first, OuterLoop);
1286  else
1287    addBlockEdges(*this, Irr, OuterLoop);
1288}
1289}
1290
1291/// \brief Shared implementation for block frequency analysis.
1292///
1293/// This is a shared implementation of BlockFrequencyInfo and
1294/// MachineBlockFrequencyInfo, and calculates the relative frequencies of
1295/// blocks.
1296///
1297/// LoopInfo defines a loop as a "non-trivial" SCC dominated by a single block,
1298/// which is called the header.  A given loop, L, can have sub-loops, which are
1299/// loops within the subgraph of L that exclude its header.  (A "trivial" SCC
1300/// consists of a single block that does not have a self-edge.)
1301///
1302/// In addition to loops, this algorithm has limited support for irreducible
1303/// SCCs, which are SCCs with multiple entry blocks.  Irreducible SCCs are
/// discovered on the fly, and modelled as loops with multiple headers.
1305///
/// The headers of an irreducible sub-SCC consist of its entry blocks and all
1307/// nodes that are targets of a backedge within it (excluding backedges within
1308/// true sub-loops).  Block frequency calculations act as if a block is
1309/// inserted that intercepts all the edges to the headers.  All backedges and
1310/// entries point to this block.  Its successors are the headers, which split
1311/// the frequency evenly.
1312///
1313/// This algorithm leverages BlockMass and UnsignedFloat to maintain precision,
1314/// separates mass distribution from loop scaling, and dithers to eliminate
1315/// probability mass loss.
1316///
1317/// The implementation is split between BlockFrequencyInfoImpl, which knows the
1318/// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and
1319/// BlockFrequencyInfoImplBase, which doesn't.  The base class uses \a
1320/// BlockNode, a wrapper around a uint32_t.  BlockNode is numbered from 0 in
1321/// reverse-post order.  This gives two advantages:  it's easy to compare the
1322/// relative ordering of two nodes, and maps keyed on BlockT can be represented
1323/// by vectors.
1324///
1325/// This algorithm is O(V+E), unless there is irreducible control flow, in
1326/// which case it's O(V*E) in the worst case.
1327///
1328/// These are the main stages:
1329///
1330///  0. Reverse post-order traversal (\a initializeRPOT()).
1331///
1332///     Run a single post-order traversal and save it (in reverse) in RPOT.
1333///     All other stages make use of this ordering.  Save a lookup from BlockT
1334///     to BlockNode (the index into RPOT) in Nodes.
1335///
1336///  1. Loop initialization (\a initializeLoops()).
1337///
1338///     Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of
1339///     the algorithm.  In particular, store the immediate members of each loop
1340///     in reverse post-order.
1341///
1342///  2. Calculate mass and scale in loops (\a computeMassInLoops()).
1343///
1344///     For each loop (bottom-up), distribute mass through the DAG resulting
1345///     from ignoring backedges and treating sub-loops as a single pseudo-node.
1346///     Track the backedge mass distributed to the loop header, and use it to
1347///     calculate the loop scale (number of loop iterations).  Immediate
1348///     members that represent sub-loops will already have been visited and
1349///     packaged into a pseudo-node.
1350///
1351///     Distributing mass in a loop is a reverse-post-order traversal through
1352///     the loop.  Start by assigning full mass to the Loop header.  For each
1353///     node in the loop:
1354///
1355///         - Fetch and categorize the weight distribution for its successors.
1356///           If this is a packaged-subloop, the weight distribution is stored
1357///           in \a LoopData::Exits.  Otherwise, fetch it from
1358///           BranchProbabilityInfo.
1359///
1360///         - Each successor is categorized as \a Weight::Local, a local edge
1361///           within the current loop, \a Weight::Backedge, a backedge to the
1362///           loop header, or \a Weight::Exit, any successor outside the loop.
1363///           The weight, the successor, and its category are stored in \a
1364///           Distribution.  There can be multiple edges to each successor.
1365///
1366///         - If there's a backedge to a non-header, there's an irreducible SCC.
1367///           The usual flow is temporarily aborted.  \a
1368///           computeIrreducibleMass() finds the irreducible SCCs within the
1369///           loop, packages them up, and restarts the flow.
1370///
1371///         - Normalize the distribution:  scale weights down so that their sum
1372///           is 32-bits, and coalesce multiple edges to the same node.
1373///
1374///         - Distribute the mass accordingly, dithering to minimize mass loss,
1375///           as described in \a distributeMass().
1376///
1377///     Finally, calculate the loop scale from the accumulated backedge mass.
1378///
1379///  3. Distribute mass in the function (\a computeMassInFunction()).
1380///
1381///     Finally, distribute mass through the DAG resulting from packaging all
1382///     loops in the function.  This uses the same algorithm as distributing
1383///     mass in a loop, except that there are no exit or backedge edges.
1384///
1385///  4. Unpackage loops (\a unwrapLoops()).
1386///
1387///     Initialize each block's frequency to a floating point representation of
1388///     its mass.
1389///
1390///     Visit loops top-down, scaling the frequencies of its immediate members
1391///     by the loop's pseudo-node's frequency.
1392///
1393///  5. Convert frequencies to a 64-bit range (\a finalizeMetrics()).
1394///
1395///     Using the min and max frequencies as a guide, translate floating point
1396///     frequencies to an appropriate range in uint64_t.
1397///
1398/// It has some known flaws.
1399///
1400///   - Loop scale is limited to 4096 per loop (2^12) to avoid exhausting
1401///     BlockFrequency's 64-bit integer precision.
1402///
1403///   - The model of irreducible control flow is a rough approximation.
1404///
1405///     Modelling irreducible control flow exactly involves setting up and
1406///     solving a group of infinite geometric series.  Such precision is
1407///     unlikely to be worthwhile, since most of our algorithms give up on
1408///     irreducible control flow anyway.
1409///
1410///     Nevertheless, we might find that we need to get closer.  Here's a sort
1411///     of TODO list for the model with diminishing returns, to be completed as
1412///     necessary.
1413///
1414///       - The headers for the \a LoopData representing an irreducible SCC
1415///         include non-entry blocks.  When these extra blocks exist, they
1416///         indicate a self-contained irreducible sub-SCC.  We could treat them
1417///         as sub-loops, rather than arbitrarily shoving the problematic
1418///         blocks into the headers of the main irreducible SCC.
1419///
1420///       - Backedge frequencies are assumed to be evenly split between the
1421///         headers of a given irreducible SCC.  Instead, we could track the
1422///         backedge mass separately for each header, and adjust their relative
1423///         frequencies.
1424///
1425///       - Entry frequencies are assumed to be evenly split between the
1426///         headers of a given irreducible SCC, which is the only option if we
1427///         need to compute mass in the SCC before its parent loop.  Instead,
1428///         we could partially compute mass in the parent loop, and stop when
1429///         we get to the SCC.  Here, we have the correct ratio of entry
1430///         masses, which we can use to adjust their relative frequencies.
1431///         Compute mass in the SCC, and then continue propagation in the
1432///         parent.
1433///
1434///       - We can propagate mass iteratively through the SCC, for some fixed
1435///         number of iterations.  Each iteration starts by assigning the entry
1436///         blocks their backedge mass from the prior iteration.  The final
1437///         mass for each block (and each exit, and the total backedge mass
1438///         used for computing loop scale) is the sum of all iterations.
1439///         (Running this until fixed point would "solve" the geometric
1440///         series by simulation.)
1441template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
1442  typedef typename bfi_detail::TypeMap<BT>::BlockT BlockT;
1443  typedef typename bfi_detail::TypeMap<BT>::FunctionT FunctionT;
1444  typedef typename bfi_detail::TypeMap<BT>::BranchProbabilityInfoT
1445  BranchProbabilityInfoT;
1446  typedef typename bfi_detail::TypeMap<BT>::LoopT LoopT;
1447  typedef typename bfi_detail::TypeMap<BT>::LoopInfoT LoopInfoT;
1448
1449  // This is part of a workaround for a GCC 4.7 crash on lambdas.
1450  friend struct bfi_detail::BlockEdgesAdder<BT>;
1451
1452  typedef GraphTraits<const BlockT *> Successor;
1453  typedef GraphTraits<Inverse<const BlockT *>> Predecessor;
1454
1455  const BranchProbabilityInfoT *BPI;
1456  const LoopInfoT *LI;
1457  const FunctionT *F;
1458
1459  // All blocks in reverse postorder.
1460  std::vector<const BlockT *> RPOT;
1461  DenseMap<const BlockT *, BlockNode> Nodes;
1462
1463  typedef typename std::vector<const BlockT *>::const_iterator rpot_iterator;
1464
1465  rpot_iterator rpot_begin() const { return RPOT.begin(); }
1466  rpot_iterator rpot_end() const { return RPOT.end(); }
1467
1468  size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); }
1469
1470  BlockNode getNode(const rpot_iterator &I) const {
1471    return BlockNode(getIndex(I));
1472  }
1473  BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB); }
1474
1475  const BlockT *getBlock(const BlockNode &Node) const {
1476    assert(Node.Index < RPOT.size());
1477    return RPOT[Node.Index];
1478  }
1479
1480  /// \brief Run (and save) a post-order traversal.
1481  ///
1482  /// Saves a reverse post-order traversal of all the nodes in \a F.
1483  void initializeRPOT();
1484
1485  /// \brief Initialize loop data.
1486  ///
1487  /// Build up \a Loops using \a LoopInfo.  \a LoopInfo gives us a mapping from
1488  /// each block to the deepest loop it's in, but we need the inverse.  For each
1489  /// loop, we store in reverse post-order its "immediate" members, defined as
1490  /// the header, the headers of immediate sub-loops, and all other blocks in
1491  /// the loop that are not in sub-loops.
1492  void initializeLoops();
1493
1494  /// \brief Propagate to a block's successors.
1495  ///
1496  /// In the context of distributing mass through \c OuterLoop, divide the mass
1497  /// currently assigned to \c Node between its successors.
1498  ///
1499  /// \return \c true unless there's an irreducible backedge.
1500  bool propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node);
1501
1502  /// \brief Compute mass in a particular loop.
1503  ///
1504  /// Assign mass to \c Loop's header, and then for each block in \c Loop in
1505  /// reverse post-order, distribute mass to its successors.  Only visits nodes
1506  /// that have not been packaged into sub-loops.
1507  ///
1508  /// \pre \a computeMassInLoop() has been called for each subloop of \c Loop.
1509  /// \return \c true unless there's an irreducible backedge.
1510  bool computeMassInLoop(LoopData &Loop);
1511
1512  /// \brief Try to compute mass in the top-level function.
1513  ///
1514  /// Assign mass to the entry block, and then for each block in reverse
1515  /// post-order, distribute mass to its successors.  Skips nodes that have
1516  /// been packaged into loops.
1517  ///
1518  /// \pre \a computeMassInLoops() has been called.
1519  /// \return \c true unless there's an irreducible backedge.
1520  bool tryToComputeMassInFunction();
1521
1522  /// \brief Compute mass in (and package up) irreducible SCCs.
1523  ///
1524  /// Find the irreducible SCCs in \c OuterLoop, add them to \a Loops (in front
1525  /// of \c Insert), and call \a computeMassInLoop() on each of them.
1526  ///
1527  /// If \c OuterLoop is \c nullptr, it refers to the top-level function.
1528  ///
1529  /// \pre \a computeMassInLoop() has been called for each subloop of \c
1530  /// OuterLoop.
1531/// \pre \c Insert points at the last loop successfully processed by \a
1532/// computeMassInLoop().
1533  /// \pre \c OuterLoop has irreducible SCCs.
1534  void computeIrreducibleMass(LoopData *OuterLoop,
1535                              std::list<LoopData>::iterator Insert);
1536
1537  /// \brief Compute mass in all loops.
1538  ///
1539  /// For each loop bottom-up, call \a computeMassInLoop().
1540  ///
1541/// \a computeMassInLoop() aborts (and returns \c false) on loops that
1542/// contain irreducible sub-SCCs.  When that happens, call \a
1543/// computeIrreducibleMass() and then re-enter \a computeMassInLoop().
1544  ///
1545  /// \post \a computeMassInLoop() has returned \c true for every loop.
1546  void computeMassInLoops();
1547
1548  /// \brief Compute mass in the top-level function.
1549  ///
1550  /// Uses \a tryToComputeMassInFunction() and \a computeIrreducibleMass() to
1551  /// compute mass in the top-level function.
1552  ///
1553  /// \post \a tryToComputeMassInFunction() has returned \c true.
1554  void computeMassInFunction();
1555
1556  std::string getBlockName(const BlockNode &Node) const override {
1557    return bfi_detail::getBlockName(getBlock(Node));
1558  }
1559
1560public:
1561  const FunctionT *getFunction() const { return F; }
1562
1563  void doFunction(const FunctionT *F, const BranchProbabilityInfoT *BPI,
1564                  const LoopInfoT *LI);
1565  BlockFrequencyInfoImpl() : BPI(nullptr), LI(nullptr), F(nullptr) {}
1566
1567  using BlockFrequencyInfoImplBase::getEntryFreq;
1568  BlockFrequency getBlockFreq(const BlockT *BB) const {
1569    return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB));
1570  }
1571  Float getFloatingBlockFreq(const BlockT *BB) const {
1572    return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB));
1573  }
1574
1575  /// \brief Print the frequencies for the current function.
1576  ///
1577  /// Prints the frequencies for the blocks in the current function.
1578  ///
1579  /// Blocks are printed in the natural iteration order of the function, rather
1580  /// than reverse post-order.  This provides two advantages:  writing -analyze
1581  /// tests is easier (since blocks come out in source order), and even
1582  /// unreachable blocks are printed.
1583  ///
1584  /// \a BlockFrequencyInfoImplBase::print() only knows reverse post-order, so
1585  /// we need to override it here.
1586  raw_ostream &print(raw_ostream &OS) const override;
1587  using BlockFrequencyInfoImplBase::dump;
1588
1589  using BlockFrequencyInfoImplBase::printBlockFreq;
1590  raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const {
1591    return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB));
1592  }
1593};
1594
1595template <class BT>
1596void BlockFrequencyInfoImpl<BT>::doFunction(const FunctionT *F,
1597                                            const BranchProbabilityInfoT *BPI,
1598                                            const LoopInfoT *LI) {
1599  // Save the parameters.
1600  this->BPI = BPI;
1601  this->LI = LI;
1602  this->F = F;
1603
1604  // Clean up left-over data structures.
1605  BlockFrequencyInfoImplBase::clear();
1606  RPOT.clear();
1607  Nodes.clear();
1608
1609  // Initialize.
1610  DEBUG(dbgs() << "\nblock-frequency: " << F->getName() << "\n================="
1611               << std::string(F->getName().size(), '=') << "\n");
1612  initializeRPOT();
1613  initializeLoops();
1614
1615  // Visit loops in post-order to find the local mass distribution, and then
1616  // do the full function.
1617  computeMassInLoops();
1618  computeMassInFunction();
1619  unwrapLoops();
1620  finalizeMetrics();
1621}
1622
1623template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() {
1624  const BlockT *Entry = F->begin();
1625  RPOT.reserve(F->size());
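  // Save a post-order walk from the entry block and reverse it to get the
  // reverse post-order.  Blocks unreachable from the entry are never visited,
  // so they are not assigned a BlockNode here.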
1626  std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT));
1627  std::reverse(RPOT.begin(), RPOT.end());
1628
1629  assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() &&
1630         "More nodes in function than Block Frequency Info supports");
1631
1632  DEBUG(dbgs() << "reverse-post-order-traversal\n");
1633  for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
1634    BlockNode Node = getNode(I);
1635    DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node) << "\n");
1636    Nodes[*I] = Node;
1637  }
1638
1639  Working.reserve(RPOT.size());
1640  for (size_t Index = 0; Index < RPOT.size(); ++Index)
1641    Working.emplace_back(Index);
1642  Freqs.resize(RPOT.size());
1643}
1644
1645template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() {
1646  DEBUG(dbgs() << "loop-detection\n");
1647  if (LI->empty())
1648    return;
1649
1650  // Visit loops top down and assign them an index.
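  // Note that Loops is a std::list, so the address of a parent's LoopData
  // (saved in the queue and in each child) stays valid as loops are appended.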
1651  std::deque<std::pair<const LoopT *, LoopData *>> Q;
1652  for (const LoopT *L : *LI)
1653    Q.emplace_back(L, nullptr);
1654  while (!Q.empty()) {
1655    const LoopT *Loop = Q.front().first;
1656    LoopData *Parent = Q.front().second;
1657    Q.pop_front();
1658
1659    BlockNode Header = getNode(Loop->getHeader());
1660    assert(Header.isValid());
1661
1662    Loops.emplace_back(Parent, Header);
1663    Working[Header.Index].Loop = &Loops.back();
1664    DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");
1665
1666    for (const LoopT *L : *Loop)
1667      Q.emplace_back(L, &Loops.back());
1668  }
1669
1670  // Visit nodes in reverse post-order and add them to their deepest containing
1671  // loop.
1672  for (size_t Index = 0; Index < RPOT.size(); ++Index) {
1673    // Loop headers have already been mostly mapped.
1674    if (Working[Index].isLoopHeader()) {
1675      LoopData *ContainingLoop = Working[Index].getContainingLoop();
1676      if (ContainingLoop)
1677        ContainingLoop->Nodes.push_back(Index);
1678      continue;
1679    }
1680
1681    const LoopT *Loop = LI->getLoopFor(RPOT[Index]);
1682    if (!Loop)
1683      continue;
1684
1685    // Add this node to its containing loop's member list.
1686    BlockNode Header = getNode(Loop->getHeader());
1687    assert(Header.isValid());
1688    const auto &HeaderData = Working[Header.Index];
1689    assert(HeaderData.isLoopHeader());
1690
1691    Working[Index].Loop = HeaderData.Loop;
1692    HeaderData.Loop->Nodes.push_back(Index);
1693    DEBUG(dbgs() << " - loop = " << getBlockName(Header)
1694                 << ": member = " << getBlockName(Index) << "\n");
1695  }
1696}
1697
1698template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInLoops() {
1699  // Visit the deepest loops first, and the top-level loops last.
1700  for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) {
1701    if (computeMassInLoop(*L))
1702      continue;
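    // *L contains an irreducible sub-SCC.  computeIrreducibleMass() inserts
    // its LoopData entries just after *L in forward order (at L.base()) and
    // computes their mass itself; that insertion would slide this reverse
    // iterator onto the new entries, so save std::next(L) (whose base points
    // at *L) and restore L from it before retrying *L.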
1703    auto Next = std::next(L);
1704    computeIrreducibleMass(&*L, L.base());
1705    L = std::prev(Next);
1706    if (computeMassInLoop(*L))
1707      continue;
1708    llvm_unreachable("unhandled irreducible control flow");
1709  }
1710}
1711
1712template <class BT>
1713bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
1714  // Compute mass in loop.
1715  DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n");
1716
1717  if (Loop.isIrreducible()) {
1718    BlockMass Remaining = BlockMass::getFull();
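    // Split the full mass evenly across the headers: with N headers, header H
    // takes 1/(N - H) of what remains, so for N = 3 the shares work out to
    // 1/3, then 1/2 of 2/3, then all of the final 1/3.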
1719    for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
1720      auto &Mass = Working[Loop.Nodes[H].Index].getMass();
1721      Mass = Remaining * BranchProbability(1, Loop.NumHeaders - H);
1722      Remaining -= Mass;
1723    }
1724    for (const BlockNode &M : Loop.Nodes)
1725      if (!propagateMassToSuccessors(&Loop, M))
1726        llvm_unreachable("unhandled irreducible control flow");
1727  } else {
1728    Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
1729    if (!propagateMassToSuccessors(&Loop, Loop.getHeader()))
1730      llvm_unreachable("irreducible control flow to loop header!?");
1731    for (const BlockNode &M : Loop.members())
1732      if (!propagateMassToSuccessors(&Loop, M))
1733        // Irreducible backedge.
1734        return false;
1735  }
1736
1737  computeLoopScale(Loop);
1738  packageLoop(Loop);
1739  return true;
1740}
1741
1742template <class BT>
1743bool BlockFrequencyInfoImpl<BT>::tryToComputeMassInFunction() {
1744  // Compute mass in function.
1745  DEBUG(dbgs() << "compute-mass-in-function\n");
1746  assert(!Working.empty() && "no blocks in function");
1747  assert(!Working[0].isLoopHeader() && "entry block is a loop header");
1748
1749  Working[0].getMass() = BlockMass::getFull();
1750  for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) {
1751    // Check for nodes that have been packaged.
1752    BlockNode Node = getNode(I);
1753    if (Working[Node.Index].isPackaged())
1754      continue;
1755
1756    if (!propagateMassToSuccessors(nullptr, Node))
1757      return false;
1758  }
1759  return true;
1760}
1761
1762template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInFunction() {
1763  if (tryToComputeMassInFunction())
1764    return;
1765  computeIrreducibleMass(nullptr, Loops.begin());
1766  if (tryToComputeMassInFunction())
1767    return;
1768  llvm_unreachable("unhandled irreducible control flow");
1769}
1770
1771/// \note This should be a lambda, but that crashes GCC 4.7.
1772namespace bfi_detail {
1773template <class BT> struct BlockEdgesAdder {
1774  typedef BT BlockT;
1775  typedef BlockFrequencyInfoImplBase::LoopData LoopData;
1776  typedef GraphTraits<const BlockT *> Successor;
1777
1778  const BlockFrequencyInfoImpl<BT> &BFI;
1779  explicit BlockEdgesAdder(const BlockFrequencyInfoImpl<BT> &BFI)
1780      : BFI(BFI) {}
1781  void operator()(IrreducibleGraph &G, IrreducibleGraph::IrrNode &Irr,
1782                  const LoopData *OuterLoop) {
1783    const BlockT *BB = BFI.RPOT[Irr.Node.Index];
1784    for (auto I = Successor::child_begin(BB), E = Successor::child_end(BB);
1785         I != E; ++I)
1786      G.addEdge(Irr, BFI.getNode(*I), OuterLoop);
1787  }
1788};
1789}
1790template <class BT>
1791void BlockFrequencyInfoImpl<BT>::computeIrreducibleMass(
1792    LoopData *OuterLoop, std::list<LoopData>::iterator Insert) {
1793  DEBUG(dbgs() << "analyze-irreducible-in-";
1794        if (OuterLoop) dbgs() << "loop: " << getLoopName(*OuterLoop) << "\n";
1795        else dbgs() << "function\n");
1796
1797  using namespace bfi_detail;
1798  // Ideally, addBlockEdges() would be declared here as a lambda, but that
1799  // crashes GCC 4.7.
1800  BlockEdgesAdder<BT> addBlockEdges(*this);
1801  IrreducibleGraph G(*this, OuterLoop, addBlockEdges);
1802
1803  for (auto &L : analyzeIrreducible(G, OuterLoop, Insert))
1804    computeMassInLoop(L);
1805
1806  if (!OuterLoop)
1807    return;
1808  updateLoopWithIrreducible(*OuterLoop);
1809}
1810
1811template <class BT>
1812bool
1813BlockFrequencyInfoImpl<BT>::propagateMassToSuccessors(LoopData *OuterLoop,
1814                                                      const BlockNode &Node) {
1815  DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
1816  // Calculate probability for successors.
1817  Distribution Dist;
1818  if (auto *Loop = Working[Node.Index].getPackagedLoop()) {
1819    assert(Loop != OuterLoop && "Cannot propagate mass in a packaged loop");
1820    if (!addLoopSuccessorsToDist(OuterLoop, *Loop, Dist))
1821      // Irreducible backedge.
1822      return false;
1823  } else {
1824    const BlockT *BB = getBlock(Node);
1825    for (auto SI = Successor::child_begin(BB), SE = Successor::child_end(BB);
1826         SI != SE; ++SI)
1827      // Do not dereference SI:  calling getEdgeWeight() with the successor
1828      // block instead of the iterator is linear in the number of successors.
1829      if (!addToDist(Dist, OuterLoop, Node, getNode(*SI),
1830                     BPI->getEdgeWeight(BB, SI)))
1831        // Irreducible backedge.
1832        return false;
1833  }
1834
1835  // Distribute mass to successors, saving exit and backedge data in the
1836  // loop header.
1837  distributeMass(Node, OuterLoop, Dist);
1838  return true;
1839}
1840
1841template <class BT>
1842raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const {
1843  if (!F)
1844    return OS;
1845  OS << "block-frequency-info: " << F->getName() << "\n";
1846  for (const BlockT &BB : *F)
1847    OS << " - " << bfi_detail::getBlockName(&BB)
1848       << ": float = " << getFloatingBlockFreq(&BB)
1849       << ", int = " << getBlockFreq(&BB).getFrequency() << "\n";
1850
1851  // Add an extra newline for readability.
1852  OS << "\n";
1853  return OS;
1854}
1855}
1856
1857#undef DEBUG_TYPE
1858
1859#endif
1860