diff --git a/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/include/llvm/Analysis/BlockFrequencyInfoImpl.h
index f891afdf551..66d27b7e4a7 100644
--- a/include/llvm/Analysis/BlockFrequencyInfoImpl.h
+++ b/include/llvm/Analysis/BlockFrequencyInfoImpl.h
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Shared implementation of BlockFrequencyInfo for IR and Machine Instructions.
+// Shared implementation of BlockFrequency for IR and Machine Instructions.
 //
 //===----------------------------------------------------------------------===//
 
@@ -16,8 +16,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/Support/BlockFrequency.h"
 #include "llvm/Support/BranchProbability.h"
@@ -26,13 +24,1300 @@
 #include <string>
 #include <vector>
 
+//===----------------------------------------------------------------------===//
+//
+// PositiveFloat definition.
+//
+// TODO: Make this private to BlockFrequencyInfoImpl or delete.
+//
+//===----------------------------------------------------------------------===//
 namespace llvm {
 
+class PositiveFloatBase {
+public:
+  static const int MaxExponent = 16383;
+  static const int MinExponent = -16382;
+  static const int DefaultPrecision = 10;
+
+  static void dump(uint64_t D, int16_t E, int Width);
+  static raw_ostream &print(raw_ostream &OS, uint64_t D, int16_t E, int Width,
+                            unsigned Precision);
+  static std::string toString(uint64_t D, int16_t E, int Width,
+                              unsigned Precision);
+  static int countLeadingZeros32(uint32_t N) { return countLeadingZeros(N); }
+  static int countLeadingZeros64(uint64_t N) { return countLeadingZeros(N); }
+  static uint64_t getHalf(uint64_t N) { return (N >> 1) + (N & 1); }
+
+  static std::pair<uint64_t, bool> splitSigned(int64_t N) {
+    if (N >= 0)
+      return std::make_pair(N, false);
+    if (N == INT64_MIN)
+      return std::make_pair(uint64_t(INT64_MAX) + 1, true);
+    return std::make_pair(-N, true);
+  }
+  static int64_t joinSigned(uint64_t U, bool IsNeg) {
+    if (U > INT64_MAX)
+      return IsNeg ? INT64_MIN : INT64_MAX;
+    return IsNeg ? -int64_t(U) : U;
+  }
+
+  static int32_t extractLg(const std::pair<int32_t, int> &Lg) {
+    return Lg.first;
+  }
+  static int32_t extractLgFloor(const std::pair<int32_t, int> &Lg) {
+    return Lg.first - (Lg.second > 0);
+  }
+  static int32_t extractLgCeiling(const std::pair<int32_t, int> &Lg) {
+    return Lg.first + (Lg.second < 0);
+  }
+  static uint64_t getDiff(int16_t L, int16_t R) {
+    assert(L <= R && "arguments in wrong order");
+    return (uint64_t)R - (uint64_t)L;
+  }
+
+  static std::pair<uint64_t, int16_t> divide64(uint64_t L, uint64_t R);
+  static std::pair<uint64_t, int16_t> multiply64(uint64_t L, uint64_t R);
+
+  static int compare(uint64_t L, uint64_t R, int Shift) {
+    assert(Shift >= 0);
+    assert(Shift < 64);
+
+    uint64_t L_adjusted = L >> Shift;
+    if (L_adjusted < R)
+      return -1;
+    if (L_adjusted > R)
+      return 1;
+
+    return L > L_adjusted << Shift ? 1 : 0;
+  }
+};
+
+/// \brief Simple representation of a positive floating point number.
+///
+/// PositiveFloat is a positive floating point number.  It uses simple
+/// saturation arithmetic, and every operation is well-defined for every value.
+///
+/// The number is split into a signed exponent and unsigned digits.  The number
+/// represented is \c getDigits()*2^getExponent().
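+/// For example (digit values chosen purely for illustration),
+/// PositiveFloat<uint8_t>(192, -2) and PositiveFloat<uint8_t>(48, 0) both
+/// represent the number 48.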
+/// In this way, the digits are
+/// much like the mantissa in the x87 long double, but there is no canonical
+/// form, so the same number can be represented by many bit representations
+/// (it's always in "denormal" mode).
+///
+/// PositiveFloat is templated on the underlying integer type for digits, which
+/// is expected to be one of uint64_t, uint32_t, uint16_t or uint8_t.
+///
+/// Unlike builtin floating point types, PositiveFloat is portable.
+///
+/// Unlike APFloat, PositiveFloat does not model architecture floating point
+/// behaviour (this should make it a little faster), and implements most
+/// operators (this makes it usable).
+///
+/// PositiveFloat is totally ordered.  However, there is no canonical form, so
+/// there are multiple representations of most scalars.  E.g.:
+///
+///     PositiveFloat(8u, 0) == PositiveFloat(4u, 1)
+///     PositiveFloat(4u, 1) == PositiveFloat(2u, 2)
+///     PositiveFloat(2u, 2) == PositiveFloat(1u, 3)
+///
+/// PositiveFloat implements most arithmetic operations.  Precision is kept
+/// where possible.  Uses simple saturation arithmetic, so that operations
+/// saturate to 0.0 or getLargest() rather than under or overflowing.  It has
+/// some extra arithmetic for unit inversion.  0.0/0.0 is defined to be 0.0.
+/// Any other division by 0.0 is defined to be getLargest().
+///
+/// As a convenience for modifying the exponent, left and right shifting are
+/// both implemented, and both interpret negative shifts as positive shifts in
+/// the opposite direction.
+///
+/// Future work might extract most of the implementation into a base class
+/// (e.g., \c Float) that has an \c IsSigned template parameter.  The initial
+/// use case for this only needed positive semantics, but it wouldn't take much
+/// work to extend.
+///
+/// Exponents are limited to the range accepted by x87 long double.  This makes
+/// it trivial to add functionality to convert to APFloat (this is already
+/// relied on for the implementation of printing).
+template <class DigitsT> class PositiveFloat : PositiveFloatBase {
+public:
+  static_assert(!std::numeric_limits<DigitsT>::is_signed,
+                "only unsigned floats supported");
+
+  typedef DigitsT DigitsType;
+
+private:
+  typedef std::numeric_limits<DigitsType> DigitsLimits;
+
+  static const int Width = sizeof(DigitsType) * 8;
+  static_assert(Width <= 64, "invalid integer width for digits");
+
+private:
+  DigitsType Digits;
+  int16_t Exponent;
+
+public:
+  PositiveFloat() : Digits(0), Exponent(0) {}
+
+  PositiveFloat(DigitsType Digits, int16_t Exponent)
+      : Digits(Digits), Exponent(Exponent) {}
+
+private:
+  PositiveFloat(const std::pair<uint64_t, int16_t> &X)
+      : Digits(X.first), Exponent(X.second) {}
+
+public:
+  static PositiveFloat getZero() { return PositiveFloat(0, 0); }
+  static PositiveFloat getOne() { return PositiveFloat(1, 0); }
+  static PositiveFloat getLargest() {
+    return PositiveFloat(DigitsLimits::max(), MaxExponent);
+  }
+  static PositiveFloat getFloat(uint64_t N) { return adjustToWidth(N, 0); }
+  static PositiveFloat getInverseFloat(uint64_t N) {
+    return getFloat(N).invert();
+  }
+  static PositiveFloat getFraction(DigitsType N, DigitsType D) {
+    return getQuotient(N, D);
+  }
+
+  int16_t getExponent() const { return Exponent; }
+  DigitsType getDigits() const { return Digits; }
+
+  template <class IntT> IntT toInt() const;
+
+  bool isZero() const { return !Digits; }
+  bool isLargest() const { return *this == getLargest(); }
+  bool isOne() const {
+    if (Exponent > 0 || Exponent <= -Width)
+      return false;
+    return Digits == DigitsType(1) << -Exponent;
+  }
+
+  /// \brief The log base 2, rounded.
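+  ///
+  /// For example (illustrative values): PositiveFloat<uint64_t>(3, 0).lg()
+  /// is 2, since lg(3) is roughly 1.58 and rounds up; lgFloor() is 1 and
+  /// lgCeiling() is 2 for the same number.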
+ /// + /// Get the lg of the scalar. lg 0 is defined to be INT32_MIN. + int32_t lg() const { return extractLg(lgImpl()); } + + /// \brief The log base 2, rounded towards INT32_MIN. + /// + /// Get the lg floor. lg 0 is defined to be INT32_MIN. + int32_t lgFloor() const { return extractLgFloor(lgImpl()); } + + /// \brief The log base 2, rounded towards INT32_MAX. + /// + /// Get the lg ceiling. lg 0 is defined to be INT32_MIN. + int32_t lgCeiling() const { return extractLgCeiling(lgImpl()); } + + bool operator==(const PositiveFloat &X) const { return compare(X) == 0; } + bool operator<(const PositiveFloat &X) const { return compare(X) < 0; } + bool operator!=(const PositiveFloat &X) const { return compare(X) != 0; } + bool operator>(const PositiveFloat &X) const { return compare(X) > 0; } + bool operator<=(const PositiveFloat &X) const { return compare(X) <= 0; } + bool operator>=(const PositiveFloat &X) const { return compare(X) >= 0; } + + bool operator!() const { return isZero(); } + + /// \brief Convert to a decimal representation in a string. + /// + /// Convert to a string. Uses scientific notation for very large/small + /// numbers. Scientific notation is used roughly for numbers outside of the + /// range 2^-64 through 2^64. + /// + /// \c Precision indicates the number of decimal digits of precision to use; + /// 0 requests the maximum available. + /// + /// As a special case to make debugging easier, if the number is small enough + /// to convert without scientific notation and has more than \c Precision + /// digits before the decimal place, it's printed accurately to the first + /// digit past zero. E.g., assuming 10 digits of precision: + /// + /// 98765432198.7654... => 98765432198.8 + /// 8765432198.7654... => 8765432198.8 + /// 765432198.7654... => 765432198.8 + /// 65432198.7654... => 65432198.77 + /// 5432198.7654... => 5432198.765 + std::string toString(unsigned Precision = DefaultPrecision) { + return PositiveFloatBase::toString(Digits, Exponent, Width, Precision); + } + + /// \brief Print a decimal representation. + /// + /// Print a string. See toString for documentation. + raw_ostream &print(raw_ostream &OS, + unsigned Precision = DefaultPrecision) const { + return PositiveFloatBase::print(OS, Digits, Exponent, Width, Precision); + } + void dump() const { return PositiveFloatBase::dump(Digits, Exponent, Width); } + + PositiveFloat &operator+=(const PositiveFloat &X); + PositiveFloat &operator-=(const PositiveFloat &X); + PositiveFloat &operator*=(const PositiveFloat &X); + PositiveFloat &operator/=(const PositiveFloat &X); + PositiveFloat &operator<<=(int16_t Shift) { return shiftLeft(Shift); } + PositiveFloat &operator>>=(int16_t Shift) { return shiftRight(Shift); } + +private: + PositiveFloat &shiftLeft(int32_t Shift); + PositiveFloat &shiftRight(int32_t Shift); + PositiveFloat normalizeExponents(PositiveFloat X); + +public: + /// \brief Scale a large number accurately. + /// + /// Scale N (multiply it by this). Uses full precision multiplication, even + /// if Width is smaller than 64, so information is not lost. + uint64_t scale(uint64_t N) const; + uint64_t scaleByInverse(uint64_t N) const { + // TODO: implement directly, rather than relying on inverse. Inverse is + // expensive. 
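+    // (A direct implementation would presumably mirror getQuotient(): shift
+    // N up, divide by Digits, and subtract Exponent from the result's
+    // exponent.  This sketch is an assumption, not part of the current
+    // interface.)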
+    return inverse().scale(N);
+  }
+  int64_t scale(int64_t N) const {
+    std::pair<uint64_t, bool> Unsigned = splitSigned(N);
+    return joinSigned(scale(Unsigned.first), Unsigned.second);
+  }
+  int64_t scaleByInverse(int64_t N) const {
+    std::pair<uint64_t, bool> Unsigned = splitSigned(N);
+    return joinSigned(scaleByInverse(Unsigned.first), Unsigned.second);
+  }
+
+  int compare(const PositiveFloat &X) const;
+  int compareTo(uint64_t N) const {
+    PositiveFloat Float = getFloat(N);
+    int Compare = compare(Float);
+    if (Width == 64 || Compare != 0)
+      return Compare;
+
+    // Check for precision loss.  We know *this == RoundTrip.
+    uint64_t RoundTrip = Float.template toInt<uint64_t>();
+    return N == RoundTrip ? 0 : RoundTrip < N ? -1 : 1;
+  }
+  int compareTo(int64_t N) const { return N < 0 ? 1 : compareTo(uint64_t(N)); }
+
+  PositiveFloat &invert() { return *this = PositiveFloat::getFloat(1) / *this; }
+  PositiveFloat inverse() const { return PositiveFloat(*this).invert(); }
+
+private:
+  static PositiveFloat getProduct(DigitsType L, DigitsType R);
+  static PositiveFloat getQuotient(DigitsType Dividend, DigitsType Divisor);
+
+  std::pair<int32_t, int> lgImpl() const;
+  static int countLeadingZerosWidth(DigitsType Digits) {
+    if (Width == 64)
+      return countLeadingZeros64(Digits);
+    if (Width == 32)
+      return countLeadingZeros32(Digits);
+    return countLeadingZeros32(Digits) + Width - 32;
+  }
+
+  static PositiveFloat adjustToWidth(uint64_t N, int S) {
+    assert(S >= MinExponent);
+    assert(S <= MaxExponent);
+    if (Width == 64 || N <= DigitsLimits::max())
+      return PositiveFloat(N, S);
+
+    // Shift right.
+    int Shift = 64 - Width - countLeadingZeros64(N);
+    DigitsType Shifted = N >> Shift;
+
+    // Round.
+    assert(S + Shift <= MaxExponent);
+    return getRounded(PositiveFloat(Shifted, S + Shift),
+                      N & UINT64_C(1) << (Shift - 1));
+  }
+
+  static PositiveFloat getRounded(PositiveFloat P, bool Round) {
+    if (!Round)
+      return P;
+    if (P.Digits == DigitsLimits::max())
+      // Careful of overflow in the exponent.
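+      // (E.g., uint8_t digits 0xff round up to 1*2^(Exponent+8); the shift
+      // below saturates the exponent rather than overflowing it.  Values
+      // here are illustrative only.)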
+      return PositiveFloat(1, P.Exponent) <<= Width;
+    return PositiveFloat(P.Digits + 1, P.Exponent);
+  }
+};
+
+template <class DigitsT>
+PositiveFloat<DigitsT> operator+(const PositiveFloat<DigitsT> &L,
+                                 const PositiveFloat<DigitsT> &R) {
+  return PositiveFloat<DigitsT>(L) += R;
+}
+template <class DigitsT>
+PositiveFloat<DigitsT> operator-(const PositiveFloat<DigitsT> &L,
+                                 const PositiveFloat<DigitsT> &R) {
+  return PositiveFloat<DigitsT>(L) -= R;
+}
+template <class DigitsT>
+PositiveFloat<DigitsT> operator*(const PositiveFloat<DigitsT> &L,
+                                 const PositiveFloat<DigitsT> &R) {
+  return PositiveFloat<DigitsT>(L) *= R;
+}
+template <class DigitsT>
+PositiveFloat<DigitsT> operator/(const PositiveFloat<DigitsT> &L,
+                                 const PositiveFloat<DigitsT> &R) {
+  return PositiveFloat<DigitsT>(L) /= R;
+}
+template <class DigitsT>
+PositiveFloat<DigitsT> operator<<(const PositiveFloat<DigitsT> &F,
+                                  int16_t Shift) {
+  return PositiveFloat<DigitsT>(F) <<= Shift;
+}
+template <class DigitsT>
+PositiveFloat<DigitsT> operator>>(const PositiveFloat<DigitsT> &F,
+                                  int16_t Shift) {
+  return PositiveFloat<DigitsT>(F) >>= Shift;
+}
+
+template <class DigitsT>
+raw_ostream &operator<<(raw_ostream &OS, const PositiveFloat<DigitsT> &X) {
+  return X.print(OS, 10);
+}
+
+template <class DigitsT>
+bool operator<(const PositiveFloat<DigitsT> &L, uint64_t R) {
+  return L.compareTo(R) < 0;
+}
+template <class DigitsT>
+bool operator>(const PositiveFloat<DigitsT> &L, uint64_t R) {
+  return L.compareTo(R) > 0;
+}
+template <class DigitsT>
+bool operator==(const PositiveFloat<DigitsT> &L, uint64_t R) {
+  return L.compareTo(R) == 0;
+}
+template <class DigitsT>
+bool operator!=(const PositiveFloat<DigitsT> &L, uint64_t R) {
+  return L.compareTo(R) != 0;
+}
+template <class DigitsT>
+bool operator<=(const PositiveFloat<DigitsT> &L, uint64_t R) {
+  return L.compareTo(R) <= 0;
+}
+template <class DigitsT>
+bool operator>=(const PositiveFloat<DigitsT> &L, uint64_t R) {
+  return L.compareTo(R) >= 0;
+}
+
+template <class DigitsT>
+bool operator<(const PositiveFloat<DigitsT> &L, int64_t R) {
+  return L.compareTo(R) < 0;
+}
+template <class DigitsT>
+bool operator>(const PositiveFloat<DigitsT> &L, int64_t R) {
+  return L.compareTo(R) > 0;
+}
+template <class DigitsT>
+bool operator==(const PositiveFloat<DigitsT> &L, int64_t R) {
+  return L.compareTo(R) == 0;
+}
+template <class DigitsT>
+bool operator!=(const PositiveFloat<DigitsT> &L, int64_t R) {
+  return L.compareTo(R) != 0;
+}
+template <class DigitsT>
+bool operator<=(const PositiveFloat<DigitsT> &L, int64_t R) {
+  return L.compareTo(R) <= 0;
+}
+template <class DigitsT>
+bool operator>=(const PositiveFloat<DigitsT> &L, int64_t R) {
+  return L.compareTo(R) >= 0;
+}
+
+template <class DigitsT>
+bool operator<(const PositiveFloat<DigitsT> &L, uint32_t R) {
+  return L.compareTo(uint64_t(R)) < 0;
+}
+template <class DigitsT>
+bool operator>(const PositiveFloat<DigitsT> &L, uint32_t R) {
+  return L.compareTo(uint64_t(R)) > 0;
+}
+template <class DigitsT>
+bool operator==(const PositiveFloat<DigitsT> &L, uint32_t R) {
+  return L.compareTo(uint64_t(R)) == 0;
+}
+template <class DigitsT>
+bool operator!=(const PositiveFloat<DigitsT> &L, uint32_t R) {
+  return L.compareTo(uint64_t(R)) != 0;
+}
+template <class DigitsT>
+bool operator<=(const PositiveFloat<DigitsT> &L, uint32_t R) {
+  return L.compareTo(uint64_t(R)) <= 0;
+}
+template <class DigitsT>
+bool operator>=(const PositiveFloat<DigitsT> &L, uint32_t R) {
+  return L.compareTo(uint64_t(R)) >= 0;
+}
+
+template <class DigitsT>
+bool operator<(const PositiveFloat<DigitsT> &L, int32_t R) {
+  return L.compareTo(int64_t(R)) < 0;
+}
+template <class DigitsT>
+bool operator>(const PositiveFloat<DigitsT> &L, int32_t R) {
+  return L.compareTo(int64_t(R)) > 0;
+}
+template <class DigitsT>
+bool operator==(const PositiveFloat<DigitsT> &L, int32_t R) {
+  return L.compareTo(int64_t(R)) == 0;
+}
+template <class DigitsT>
+bool operator!=(const PositiveFloat<DigitsT> &L, int32_t R) {
+  return L.compareTo(int64_t(R)) != 0;
+}
+template <class DigitsT>
+bool operator<=(const PositiveFloat<DigitsT> &L, int32_t R) {
+  return L.compareTo(int64_t(R)) <= 0;
+}
+template <class DigitsT>
+bool operator>=(const PositiveFloat<DigitsT> &L, int32_t R) {
+  return L.compareTo(int64_t(R)) >= 0;
+}
+
+template <class DigitsT>
+bool operator<(uint64_t L, const PositiveFloat<DigitsT> &R) {
+  return R > L;
+}
+template <class DigitsT>
+bool operator>(uint64_t L, const PositiveFloat<DigitsT> &R) {
+  return R < L;
+}
+template <class DigitsT>
+bool operator==(uint64_t L, const PositiveFloat<DigitsT> &R) {
+  return R == L;
+}
+template <class DigitsT>
+bool operator<=(uint64_t L, const PositiveFloat<DigitsT> &R) {
+  return R >= L;
+}
+template <class DigitsT>
+bool operator>=(uint64_t L, const PositiveFloat<DigitsT> &R) {
+  return R <= L;
+}
+template <class DigitsT>
+bool operator!=(uint64_t L, const PositiveFloat<DigitsT> &R) {
+  return R != L;
+}
+template <class DigitsT>
+bool operator<(int64_t L, const PositiveFloat<DigitsT> &R) {
+  return R > L;
+}
+template <class DigitsT>
+bool operator>(int64_t L, const PositiveFloat<DigitsT> &R) {
+  return R < L;
+}
+template <class DigitsT>
+bool operator==(int64_t L, const PositiveFloat<DigitsT> &R) {
+  return R == L;
+}
+template <class DigitsT>
+bool operator<=(int64_t L, const PositiveFloat<DigitsT> &R) {
+  return R >= L;
+}
+template <class DigitsT>
+bool operator>=(int64_t L, const PositiveFloat<DigitsT> &R) {
+  return R <= L;
+}
+template <class DigitsT>
+bool operator!=(int64_t L, const PositiveFloat<DigitsT> &R) {
+  return R != L;
+}
+template <class DigitsT>
+bool operator<(uint32_t L, const PositiveFloat<DigitsT> &R) {
+  return R > L;
+}
+template <class DigitsT>
+bool operator>(uint32_t L, const PositiveFloat<DigitsT> &R) {
+  return R < L;
+}
+template <class DigitsT>
+bool operator==(uint32_t L, const PositiveFloat<DigitsT> &R) {
+  return R == L;
+}
+template <class DigitsT>
+bool operator<=(uint32_t L, const PositiveFloat<DigitsT> &R) {
+  return R >= L;
+}
+template <class DigitsT>
+bool operator>=(uint32_t L, const PositiveFloat<DigitsT> &R) {
+  return R <= L;
+}
+template <class DigitsT>
+bool operator!=(uint32_t L, const PositiveFloat<DigitsT> &R) {
+  return R != L;
+}
+template <class DigitsT>
+bool operator<(int32_t L, const PositiveFloat<DigitsT> &R) {
+  return R > L;
+}
+template <class DigitsT>
+bool operator>(int32_t L, const PositiveFloat<DigitsT> &R) {
+  return R < L;
+}
+template <class DigitsT>
+bool operator==(int32_t L, const PositiveFloat<DigitsT> &R) {
+  return R == L;
+}
+template <class DigitsT>
+bool operator<=(int32_t L, const PositiveFloat<DigitsT> &R) {
+  return R >= L;
+}
+template <class DigitsT>
+bool operator>=(int32_t L, const PositiveFloat<DigitsT> &R) {
+  return R <= L;
+}
+template <class DigitsT>
+bool operator!=(int32_t L, const PositiveFloat<DigitsT> &R) {
+  return R != L;
+}
+
+template <class DigitsT>
+uint64_t PositiveFloat<DigitsT>::scale(uint64_t N) const {
+  if (Width == 64 || N <= DigitsLimits::max())
+    return (getFloat(N) * *this).template toInt<uint64_t>();
+  std::pair<int32_t, int> Lg = lgImpl();
+  if (extractLgFloor(Lg) >= 64)
+    return UINT64_MAX;
+  if (extractLgCeiling(Lg) <= -64)
+    return 0;
+
+  uint64_t Result = 0;
+  for (int Bit = 0; Bit < 64; Bit += Width) {
+    PositiveFloat Digit = getFloat(N & DigitsLimits::max() << Bit);
+    Digit *= *this;
+
+    uint64_t Sum = Result + (Digit.toInt<uint64_t>() >> Bit);
+    if (Sum < Result)
+      return UINT64_MAX;
+    Result = Sum;
+  }
+  return Result;
+}
+
+template <class DigitsT>
+PositiveFloat<DigitsT> PositiveFloat<DigitsT>::getProduct(DigitsType L,
+                                                          DigitsType R) {
+  // Check for zero.
+  if (!L || !R)
+    return getZero();
+
+  // Check for numbers that we can compute with 64-bit math.
+  if (Width <= 32)
+    return adjustToWidth(uint64_t(L) * uint64_t(R), 0);
+
+  // Do the full thing.
+  return PositiveFloat(multiply64(L, R));
+}
+template <class DigitsT>
+PositiveFloat<DigitsT> PositiveFloat<DigitsT>::getQuotient(DigitsType Dividend,
+                                                           DigitsType Divisor) {
+  // Check for zero.
+  if (!Dividend)
+    return getZero();
+  if (!Divisor)
+    return getLargest();
+
+  if (Width == 64)
+    return PositiveFloat(divide64(Dividend, Divisor));
+
+  // We can compute this with 64-bit math.
+  int Shift = countLeadingZeros64(Dividend);
+  uint64_t Shifted = uint64_t(Dividend) << Shift;
+  uint64_t Quotient = Shifted / Divisor;
+
+  // If Quotient needs to be shifted, then adjustToWidth will round.
+  if (Quotient > DigitsLimits::max())
+    return adjustToWidth(Quotient, -Shift);
+
+  // Round based on the value of the next bit.
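+  // (Rounds half up: the remainder is compared against getHalf(Divisor),
+  // i.e. ceil(Divisor/2), so a remainder of exactly half the divisor rounds
+  // the quotient up.)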
+  return getRounded(PositiveFloat(Quotient, -Shift),
+                    Shifted % Divisor >= getHalf(Divisor));
+}
+
+template <class DigitsT>
+template <class IntT>
+IntT PositiveFloat<DigitsT>::toInt() const {
+  typedef std::numeric_limits<IntT> Limits;
+  if (*this < 1)
+    return 0;
+  if (*this >= Limits::max())
+    return Limits::max();
+
+  IntT N = Digits;
+  if (Exponent > 0) {
+    assert(size_t(Exponent) < sizeof(IntT) * 8);
+    return N << Exponent;
+  }
+  if (Exponent < 0) {
+    assert(size_t(-Exponent) < sizeof(IntT) * 8);
+    return N >> -Exponent;
+  }
+  return N;
+}
+
+template <class DigitsT>
+std::pair<int32_t, int> PositiveFloat<DigitsT>::lgImpl() const {
+  if (isZero())
+    return std::make_pair(INT32_MIN, 0);
+
+  // Get the floor of the lg of Digits.
+  int32_t LocalFloor = Width - countLeadingZerosWidth(Digits) - 1;
+
+  // Get the floor of the lg of this.
+  int32_t Floor = Exponent + LocalFloor;
+  if (Digits == UINT64_C(1) << LocalFloor)
+    return std::make_pair(Floor, 0);
+
+  // Round based on the next digit.
+  bool Round = Digits & UINT64_C(1) << (LocalFloor - 1);
+  return std::make_pair(Floor + Round, Round ? 1 : -1);
+}
+
+template <class DigitsT>
+PositiveFloat<DigitsT>
+PositiveFloat<DigitsT>::normalizeExponents(PositiveFloat X) {
+  if (isZero() || X.isZero())
+    return X;
+
+  if (Exponent > X.Exponent) {
+    // Reverse the arguments.
+    *this = X.normalizeExponents(*this);
+    return X;
+  }
+
+  if (Exponent == X.Exponent)
+    return X;
+
+  int ExponentDiff = getDiff(Exponent, X.Exponent);
+  if (ExponentDiff >= 2 * Width) {
+    *this = getZero();
+    return X;
+  }
+
+  // Use up any leading zeros on X, and then shift this.
+  int ShiftX = std::min(countLeadingZerosWidth(X.Digits), ExponentDiff);
+  int ShiftThis = ExponentDiff - ShiftX;
+
+  if (ShiftThis >= Width) {
+    *this = getZero();
+    return X;
+  }
+
+  X.Digits <<= ShiftX;
+  X.Exponent -= ShiftX;
+  Digits >>= ShiftThis;
+  Exponent += ShiftThis;
+  return X;
+}
+
+template <class DigitsT>
+PositiveFloat<DigitsT> &PositiveFloat<DigitsT>::
+operator+=(const PositiveFloat &X) {
+  if (isLargest() || X.isZero())
+    return *this;
+  if (isZero() || X.isLargest())
+    return *this = X;
+
+  // Normalize exponents.
+  PositiveFloat Scaled = normalizeExponents(X);
+
+  // Check for zero again.
+  if (isZero())
+    return *this = Scaled;
+  if (Scaled.isZero())
+    return *this;
+
+  // Compute sum.
+  DigitsType Sum = Digits + Scaled.Digits;
+  bool DidOverflow = Sum < Digits || Sum < Scaled.Digits;
+  Digits = Sum;
+  if (!DidOverflow)
+    return *this;
+
+  if (Exponent == MaxExponent)
+    return *this = getLargest();
+
+  ++Exponent;
+  Digits = Digits >> 1 | UINT64_C(1) << (Width - 1);
+
+  return *this;
+}
+template <class DigitsT>
+PositiveFloat<DigitsT> &PositiveFloat<DigitsT>::
+operator-=(const PositiveFloat &X) {
+  if (X.isZero())
+    return *this;
+  if (*this <= X)
+    return *this = getZero();
+
+  // Normalize exponents.
+  PositiveFloat Scaled = normalizeExponents(X);
+  assert(Digits >= Scaled.Digits);
+
+  // Compute difference.
+  if (!Scaled.isZero()) {
+    Digits -= Scaled.Digits;
+    return *this;
+  }
+
+  // Check if X just barely lost its last bit.  E.g., for 32-bit:
+  //
+  //   1*2^32 - 1*2^0 == 0xffffffff != 1*2^32
+  if (*this == PositiveFloat(1, X.lgFloor() + Width)) {
+    Digits = DigitsType(0) - 1;
+    --Exponent;
+  }
+  return *this;
+}
+template <class DigitsT>
+PositiveFloat<DigitsT> &PositiveFloat<DigitsT>::
+operator*=(const PositiveFloat &X) {
+  if (isZero())
+    return *this;
+  if (X.isZero())
+    return *this = X;
+
+  // Save the exponents.
+  int32_t Exponents = int32_t(Exponent) + int32_t(X.Exponent);
+
+  // Get the raw product.
+  *this = getProduct(Digits, X.Digits);
+
+  // Combine with exponents.
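+  // (MaxExponent and MinExponent bound each exponent to 14 bits of
+  // magnitude, so this 32-bit sum still fits in int16_t when it reaches
+  // operator<<=.)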
+  return *this <<= Exponents;
+}
+template <class DigitsT>
+PositiveFloat<DigitsT> &PositiveFloat<DigitsT>::
+operator/=(const PositiveFloat &X) {
+  if (isZero())
+    return *this;
+  if (X.isZero())
+    return *this = getLargest();
+
+  // Save the exponents.
+  int32_t Exponents = int32_t(Exponent) - int32_t(X.Exponent);
+
+  // Get the raw quotient.
+  *this = getQuotient(Digits, X.Digits);
+
+  // Combine with exponents.
+  return *this <<= Exponents;
+}
+template <class DigitsT>
+PositiveFloat<DigitsT> &PositiveFloat<DigitsT>::shiftLeft(int32_t Shift) {
+  if (Shift < 0)
+    return shiftRight(-Shift);
+  if (!Shift || isZero())
+    return *this;
+
+  // Shift as much as we can in the exponent.
+  int32_t ExponentShift = std::min(Shift, MaxExponent - Exponent);
+  Exponent += ExponentShift;
+  if (ExponentShift == Shift)
+    return *this;
+
+  // Check this late, since it's rare.
+  if (isLargest())
+    return *this;
+
+  // Shift as far as possible.
+  int32_t RawShift = std::min(Shift, countLeadingZerosWidth(Digits));
+  if (RawShift + ExponentShift < Shift)
+    // Saturate.
+    return *this = getLargest();
+
+  Digits <<= Shift - ExponentShift;
+  return *this;
+}
+
+template <class DigitsT>
+PositiveFloat<DigitsT> &PositiveFloat<DigitsT>::shiftRight(int32_t Shift) {
+  if (Shift < 0)
+    return shiftLeft(-Shift);
+  if (!Shift || isZero())
+    return *this;
+
+  // Shift as much as we can in the exponent.
+  int32_t ExponentShift = std::min(Shift, Exponent - MinExponent);
+  Exponent -= ExponentShift;
+  if (ExponentShift == Shift)
+    return *this;
+
+  // Shift as far as possible.
+  int32_t RawShift = Shift - ExponentShift;
+  if (RawShift >= Width)
+    // Saturate.
+    return *this = getZero();
+
+  // May result in zero.
+  Digits >>= RawShift;
+  return *this;
+}
+
+template <class DigitsT>
+int PositiveFloat<DigitsT>::compare(const PositiveFloat &X) const {
+  // Check for zero.
+  if (isZero())
+    return X.isZero() ? 0 : -1;
+  if (X.isZero())
+    return 1;
+
+  // Check for the scale.  Use lgFloor to be sure that the exponent difference
+  // is always lower than 64.
+  int32_t lgL = lgFloor(), lgR = X.lgFloor();
+  if (lgL != lgR)
+    return lgL < lgR ? -1 : 1;
+
+  // Compare digits.
+  if (Exponent < X.Exponent)
+    return PositiveFloatBase::compare(Digits, X.Digits, X.Exponent - Exponent);
+
+  return -PositiveFloatBase::compare(X.Digits, Digits, Exponent - X.Exponent);
+}
+
+template <class DigitsT> struct isPodLike<PositiveFloat<DigitsT>> {
+  static const bool value = true;
+};
+}
+
+//===----------------------------------------------------------------------===//
+//
+// BlockMass definition.
+//
+// TODO: Make this private to BlockFrequencyInfoImpl or delete.
+//
+//===----------------------------------------------------------------------===//
+namespace llvm {
+
+/// \brief Mass of a block.
+///
+/// This class implements a sort of fixed-point fraction always between 0.0 and
+/// 1.0.  getMass() == UINT64_MAX indicates a value of 1.0.
+///
+/// Masses can be added and subtracted.  Simple saturation arithmetic is used,
+/// so arithmetic operations never overflow or underflow.
+///
+/// Masses can be multiplied.  Multiplication treats full mass as 1.0 and uses
+/// an inexpensive floating-point algorithm that's off-by-one (almost, but not
+/// quite, maximum precision).
+///
+/// Masses can be scaled by \a BranchProbability at maximum precision.
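+///
+/// For example, a sketch of the saturating behaviour (values chosen purely
+/// for illustration):
+///
+/// \code
+///   BlockMass Full = BlockMass::getFull();
+///   BlockMass Half = BlockMass(UINT64_MAX / 2);
+///   assert(Full * Full == Full);     // 1.0 * 1.0 == 1.0.
+///   assert((Half + Full).isFull());  // Saturates instead of overflowing.
+///   assert((Half - Full).isEmpty()); // Saturates instead of underflowing.
+/// \endcode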
+class BlockMass {
+  uint64_t Mass;
+
+public:
+  BlockMass() : Mass(0) {}
+  explicit BlockMass(uint64_t Mass) : Mass(Mass) {}
+
+  static BlockMass getEmpty() { return BlockMass(); }
+  static BlockMass getFull() { return BlockMass(UINT64_MAX); }
+
+  uint64_t getMass() const { return Mass; }
+
+  bool isFull() const { return Mass == UINT64_MAX; }
+  bool isEmpty() const { return !Mass; }
+
+  bool operator!() const { return isEmpty(); }
+
+  /// \brief Add another mass.
+  ///
+  /// Adds another mass, saturating at \a isFull() rather than overflowing.
+  BlockMass &operator+=(const BlockMass &X) {
+    uint64_t Sum = Mass + X.Mass;
+    Mass = Sum < Mass ? UINT64_MAX : Sum;
+    return *this;
+  }
+
+  /// \brief Subtract another mass.
+  ///
+  /// Subtracts another mass, saturating at \a isEmpty() rather than
+  /// underflowing.
+  BlockMass &operator-=(const BlockMass &X) {
+    uint64_t Diff = Mass - X.Mass;
+    Mass = Diff > Mass ? 0 : Diff;
+    return *this;
+  }
+
+  /// \brief Scale by another mass.
+  ///
+  /// The current implementation is a little imprecise, but it's relatively
+  /// fast, never overflows, and maintains the property that 1.0*1.0==1.0
+  /// (where isFull represents the number 1.0).  It's an approximation of
+  /// 128-bit multiply that gets right-shifted by 64-bits.
+  ///
+  /// For a given digit size, multiplying two-digit numbers looks like:
+  ///
+  ///                  U1 .  L1
+  ///                * U2 .  L2
+  ///                ============
+  ///           0 .       . L1*L2
+  ///     +     0 . U1*L2 .     0 // (shift left once by a digit-size)
+  ///     +     0 . U2*L1 .     0 // (shift left once by a digit-size)
+  ///     + U1*U2 .     0 .     0 // (shift left twice by a digit-size)
+  ///
+  /// BlockMass has 64-bit numbers.  Split each into two 32-bit digits, stored
+  /// in 64-bits.  Add 1 to the lower digits, to model isFull as 1.0; this
+  /// won't overflow, since we have 64-bit storage for each digit.
+  ///
+  /// To do this accurately, (a) multiply into two 64-bit digits, incrementing
+  /// the upper digit on overflows of the lower digit (carry), (b) subtract 1
+  /// from the lower digit, decrementing the upper digit on underflow (carry),
+  /// and (c) truncate the lower digit.  For the 1.0*1.0 case, the upper digit
+  /// will be 0 at the end of step (a), and then will underflow back to isFull
+  /// (1.0) in step (b).
+  ///
+  /// Instead, the implementation does something a little faster with a small
+  /// loss of accuracy: ignore the lower 64-bit digit entirely.  The loss of
+  /// accuracy is small, since the sum of the unmodelled carries is 0 or 1
+  /// (i.e., step (a) will overflow at most once, and step (b) will underflow
+  /// only if step (a) overflows).
+  ///
+  /// This is the formula we're calculating:
+  ///
+  ///     U1.L1 * U2.L2 == U1 * U2 + (U1 * (L2+1))>>32 + (U2 * (L1+1))>>32
+  ///
+  /// As a demonstration of 1.0*1.0, consider two 4-bit numbers that are both
+  /// full (1111).
+  ///
+  ///     U1.L1 * U2.L2 == U1 * U2 + (U1 * (L2+1))>>2 + (U2 * (L1+1))>>2
+  ///     11.11 * 11.11 == 11 * 11 + (11 * (11+1))/4 + (11 * (11+1))/4
+  ///                   == 1001 + (11 * 100)/4 + (11 * 100)/4
+  ///                   == 1001 + 1100/4 + 1100/4
+  ///                   == 1001 + 0011 + 0011
+  ///                   == 1111
+  BlockMass &operator*=(const BlockMass &X) {
+    uint64_t U1 = Mass >> 32, L1 = Mass & UINT32_MAX, U2 = X.Mass >> 32,
+             L2 = X.Mass & UINT32_MAX;
+    Mass = U1 * U2 + (U1 * (L2 + 1) >> 32) + ((L1 + 1) * U2 >> 32);
+    return *this;
+  }
+
+  /// \brief Multiply by a branch probability.
+  ///
+  /// Multiply by P.  Guarantees full precision.
+  ///
+  /// This could be naively implemented by multiplying by the numerator and
+  /// dividing by the denominator, but in what order?  Multiplying first can
+  /// overflow, while dividing first will lose precision (potentially, changing
+  /// a non-zero mass to zero).
+  ///
+  /// The implementation mixes the two methods.  Since \a BranchProbability
+  /// uses 32-bits and \a BlockMass 64-bits, shift the mass as far to the left
+  /// as there is room, then divide by the denominator to get a quotient.
+  /// Multiplying by the numerator and right shifting gives a first
+  /// approximation.
+  ///
+  /// Calculate the error in this first approximation by calculating the
+  /// opposite mass (multiply by the opposite numerator and shift) and
+  /// subtracting both from the original mass.
+  ///
+  /// Add to the first approximation the correct fraction of this error value.
+  /// This time, multiply first and then divide, since there is no danger of
+  /// overflow.
+  ///
+  /// \pre P represents a fraction between 0.0 and 1.0.
+  BlockMass &operator*=(const BranchProbability &P);
+
+  bool operator==(const BlockMass &X) const { return Mass == X.Mass; }
+  bool operator<(const BlockMass &X) const { return Mass < X.Mass; }
+  bool operator!=(const BlockMass &X) const { return !(*this == X); }
+  bool operator>(const BlockMass &X) const { return X < *this; }
+  bool operator<=(const BlockMass &X) const { return !(*this > X); }
+  bool operator>=(const BlockMass &X) const { return !(*this < X); }
+
+  /// \brief Convert to floating point.
+  ///
+  /// Convert to a float.  \a isFull() gives 1.0, while \a isEmpty() gives
+  /// slightly above 0.0.
+  PositiveFloat<uint64_t> toFloat() const;
+
+  void dump() const;
+  raw_ostream &print(raw_ostream &OS) const;
+};
+
+inline BlockMass operator+(const BlockMass &L, const BlockMass &R) {
+  return BlockMass(L) += R;
+}
+inline BlockMass operator-(const BlockMass &L, const BlockMass &R) {
+  return BlockMass(L) -= R;
+}
+inline BlockMass operator*(const BlockMass &L, const BlockMass &R) {
+  return BlockMass(L) *= R;
+}
+inline BlockMass operator*(const BlockMass &L, const BranchProbability &R) {
+  return BlockMass(L) *= R;
+}
+inline BlockMass operator*(const BranchProbability &L, const BlockMass &R) {
+  return BlockMass(R) *= L;
+}
+
+inline raw_ostream &operator<<(raw_ostream &OS, const BlockMass &X) {
+  return X.print(OS);
+}
+
+template <> struct isPodLike<BlockMass> {
+  static const bool value = true;
+};
+}
+
+//===----------------------------------------------------------------------===//
+//
+// BlockFrequencyInfoImpl definition.
+//
+//===----------------------------------------------------------------------===//
+namespace llvm {
+
+class BasicBlock;
 class BranchProbabilityInfo;
-class BlockFrequencyInfo;
+class Function;
+class Loop;
+class LoopInfo;
+class MachineBasicBlock;
 class MachineBranchProbabilityInfo;
-class MachineBlockFrequencyInfo;
+class MachineFunction;
+class MachineLoop;
+class MachineLoopInfo;
+
+/// \brief Base class for BlockFrequencyInfoImpl.
+///
+/// BlockFrequencyInfoImplBase has supporting data structures and some
+/// algorithms for BlockFrequencyInfoImpl.  Only algorithms that depend on
+/// the block type (or that call such algorithms) are skipped here.
+///
+/// Nevertheless, the majority of the overall algorithm documentation lives
+/// with BlockFrequencyInfoImpl.  See there for details.
+class BlockFrequencyInfoImplBase {
+public:
+  typedef PositiveFloat<uint64_t> Float;
+
+  /// \brief Representative of a block.
+  ///
+  /// This is a simple wrapper around an index into the reverse-post-order
+  /// traversal of the blocks.
+  ///
+  /// Unlike a block pointer, its order has meaning (location in the
+  /// topological sort) and its class is the same regardless of block type.
+  struct BlockNode {
+    typedef uint32_t IndexType;
+    IndexType Index;
+
+    bool operator==(const BlockNode &X) const { return Index == X.Index; }
+    bool operator!=(const BlockNode &X) const { return Index != X.Index; }
+    bool operator<=(const BlockNode &X) const { return Index <= X.Index; }
+    bool operator>=(const BlockNode &X) const { return Index >= X.Index; }
+    bool operator<(const BlockNode &X) const { return Index < X.Index; }
+    bool operator>(const BlockNode &X) const { return Index > X.Index; }
+
+    BlockNode() : Index(UINT32_MAX) {}
+    BlockNode(IndexType Index) : Index(Index) {}
+
+    bool isValid() const { return Index <= getMaxIndex(); }
+    static size_t getMaxIndex() { return UINT32_MAX - 1; }
+  };
+
+  /// \brief Stats about a block itself.
+  struct FrequencyData {
+    Float Floating;
+    uint64_t Integer;
+  };
+
+  /// \brief Index of loop information.
+  struct WorkingData {
+    BlockNode ContainingLoop; ///< The block whose loop this block is inside.
+    uint32_t LoopIndex;       ///< Index into PackagedLoops.
+    bool IsPackaged;          ///< Has ContainingLoop been packaged up?
+    bool IsAPackage;          ///< Has this block's loop been packaged up?
+    BlockMass Mass;           ///< Mass distribution from the entry block.
+
+    WorkingData()
+        : LoopIndex(UINT32_MAX), IsPackaged(false), IsAPackage(false) {}
+
+    bool hasLoopHeader() const { return ContainingLoop.isValid(); }
+    bool isLoopHeader() const { return LoopIndex != UINT32_MAX; }
+  };
+
+  /// \brief Unscaled probability weight.
+  ///
+  /// Probability weight for an edge in the graph (including the
+  /// successor/target node).
+  ///
+  /// All edges in the original function are 32-bit.  However, exit edges from
+  /// loop packages are taken from 64-bit exit masses, so we need 64-bits of
+  /// space in general.
+  ///
+  /// In addition to the raw weight amount, Weight stores the type of the edge
+  /// in the current context (i.e., the context of the loop being processed).
+  /// Is this a local edge within the loop, an exit from the loop, or a
+  /// backedge to the loop header?
+  struct Weight {
+    enum DistType { Local, Exit, Backedge };
+    DistType Type;
+    BlockNode TargetNode;
+    uint64_t Amount;
+    Weight() : Type(Local), Amount(0) {}
+  };
+
+  /// \brief Distribution of unscaled probability weight.
+  ///
+  /// Distribution of unscaled probability weight to a set of successors.
+  ///
+  /// This class collates the successor edge weights for later processing.
+  ///
+  /// \a DidOverflow indicates whether \a Total did overflow while adding to
+  /// the distribution.  It should never overflow twice.  There's no flag for
+  /// whether \a ForwardTotal overflows, since when \a Total exceeds 32-bits
+  /// they both get re-computed during \a normalize().
+  struct Distribution {
+    typedef SmallVector<Weight, 4> WeightList;
+    WeightList Weights;    ///< Individual successor weights.
+    uint64_t Total;        ///< Sum of all weights.
+    bool DidOverflow;      ///< Whether \a Total did overflow.
+    uint32_t ForwardTotal; ///< Total excluding backedges.
+
+    Distribution() : Total(0), DidOverflow(false), ForwardTotal(0) {}
+    void addLocal(const BlockNode &Node, uint64_t Amount) {
+      add(Node, Amount, Weight::Local);
+    }
+    void addExit(const BlockNode &Node, uint64_t Amount) {
+      add(Node, Amount, Weight::Exit);
+    }
+    void addBackedge(const BlockNode &Node, uint64_t Amount) {
+      add(Node, Amount, Weight::Backedge);
+    }
+
+    /// \brief Normalize the distribution.
+    ///
+    /// Combines multiple edges to the same \a Weight::TargetNode and scales
+    /// down so that \a Total fits into 32-bits.
+    ///
+    /// This is linear in the size of \a Weights.  For the vast majority of
+    /// cases, adjacent edge weights are combined by sorting WeightList and
+    /// combining adjacent weights.  However, for very large edge lists an
+    /// auxiliary hash table is used.
+    void normalize();
+
+  private:
+    void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type);
+  };
+
+  /// \brief Data for a packaged loop.
+  ///
+  /// Contains the data necessary to represent a loop as a node once it's
+  /// packaged.
+  ///
+  /// PackagedLoopData inherits from BlockData to give the node the necessary
+  /// stats.  Further, it has a list of successors, list of members, and stores
+  /// the backedge mass assigned to this loop.
+  struct PackagedLoopData {
+    typedef SmallVector<std::pair<BlockNode, BlockMass>, 4> ExitMap;
+    typedef SmallVector<BlockNode, 4> MemberList;
+    BlockNode Header;       ///< Header.
+    ExitMap Exits;          ///< Successor edges (and weights).
+    MemberList Members;     ///< Members of the loop.
+    BlockMass BackedgeMass; ///< Mass returned to loop header.
+    BlockMass Mass;
+    Float Scale;
+
+    PackagedLoopData(const BlockNode &Header) : Header(Header) {}
+  };
+
+  /// \brief Data about each block.  This is used downstream.
+  std::vector<FrequencyData> Freqs;
+
+  /// \brief Loop data: see initializeLoops().
+  std::vector<WorkingData> Working;
+
+  /// \brief Indexed information about packaged loops.
+  std::vector<PackagedLoopData> PackagedLoops;
+
+  /// \brief Create the initial loop packages.
+  ///
+  /// Initializes PackagedLoops using the data in Working about backedges
+  /// and containing loops.  Called by initializeLoops().
+  ///
+  /// \post WorkingData::LoopIndex has been initialized for every loop header
+  /// and PackagedLoopData::Members has been initialized.
+
+  /// \brief Add all edges out of a packaged loop to the distribution.
+  ///
+  /// Adds all edges from LocalLoopHead to Dist.  Calls addToDist() to add each
+  /// successor edge.
+  void addLoopSuccessorsToDist(const BlockNode &LoopHead,
+                               const BlockNode &LocalLoopHead,
+                               Distribution &Dist);
+
+  /// \brief Add an edge to the distribution.
+  ///
+  /// Adds an edge to Succ to Dist.  If \c LoopHead.isValid(), then whether the
+  /// edge is forward/exit/backedge is in the context of LoopHead.  Otherwise,
+  /// every edge should be a forward edge (since all the loops are packaged
+  /// up).
+  void addToDist(Distribution &Dist, const BlockNode &LoopHead,
+                 const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight);
+
+  PackagedLoopData &getLoopPackage(const BlockNode &Head) {
+    assert(Head.Index < Working.size());
+    size_t Index = Working[Head.Index].LoopIndex;
+    assert(Index < PackagedLoops.size());
+    return PackagedLoops[Index];
+  }
+
+  /// \brief Distribute mass according to a distribution.
+  ///
+  /// Distributes the mass in Source according to Dist.  If LoopHead.isValid(),
+  /// backedges and exits are stored in its entry in PackagedLoops.
+  ///
+  /// Mass is distributed in parallel from two copies of the source mass.
+  ///
+  /// The first mass (forward) represents the distribution of mass through the
+  /// local DAG.  This distribution should lose mass at loop exits and ignore
+  /// backedges.
+  ///
+  /// The second mass (general) represents the behavior of the loop in the
+  /// global context.  In a given distribution from the head, how much mass
+  /// exits, and to where?  How much mass returns to the loop head?
+  ///
+  /// The forward mass should be split up between local successors and exits,
+  /// but only actually distributed to the local successors.  The general mass
+  /// should be split up between all three types of successors, but distributed
+  /// only to exits and backedges.
+  void distributeMass(const BlockNode &Source, const BlockNode &LoopHead,
+                      Distribution &Dist);
+
+  /// \brief Compute the loop scale for a loop.
+  void computeLoopScale(const BlockNode &LoopHead);
+
+  /// \brief Package up a loop.
+  void packageLoop(const BlockNode &LoopHead);
+
+  /// \brief Finalize frequency metrics.
+  ///
+  /// Unwraps loop packages, calculates final frequencies, and cleans up
+  /// no-longer-needed data structures.
+  void finalizeMetrics();
+
+  /// \brief Clear all memory.
+  void clear();
+
+  virtual std::string getBlockName(const BlockNode &Node) const;
+
+  virtual raw_ostream &print(raw_ostream &OS) const { return OS; }
+  void dump() const { print(dbgs()); }
+
+  Float getFloatingBlockFreq(const BlockNode &Node) const;
+
+  BlockFrequency getBlockFreq(const BlockNode &Node) const;
+
+  raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const;
+  raw_ostream &printBlockFreq(raw_ostream &OS,
+                              const BlockFrequency &Freq) const;
+
+  uint64_t getEntryFreq() const {
+    assert(!Freqs.empty());
+    return Freqs[0].Integer;
+  }
+
+  /// \brief Virtual destructor.
+  ///
+  /// Need a virtual destructor to mask the compiler warning about
+  /// getBlockName().
+  virtual ~BlockFrequencyInfoImplBase() {}
+};
 
 namespace bfi_detail {
 template <class BlockT> struct TypeMap {};
@@ -40,360 +1325,398 @@ template <> struct TypeMap<BasicBlock> {
   typedef BasicBlock BlockT;
   typedef Function FunctionT;
   typedef BranchProbabilityInfo BranchProbabilityInfoT;
+  typedef Loop LoopT;
+  typedef LoopInfo LoopInfoT;
 };
 template <> struct TypeMap<MachineBasicBlock> {
   typedef MachineBasicBlock BlockT;
   typedef MachineFunction FunctionT;
   typedef MachineBranchProbabilityInfo BranchProbabilityInfoT;
+  typedef MachineLoop LoopT;
+  typedef MachineLoopInfo LoopInfoT;
 };
+
+/// \brief Get the name of a MachineBasicBlock.
+///
+/// Get the name of a MachineBasicBlock.  It's templated so that including from
+/// CodeGen is unnecessary (that would be a layering issue).
+///
+/// This is used mainly for debug output.  The name is similar to
+/// MachineBasicBlock::getFullName(), but skips the name of the function.
+template <class BlockT> std::string getBlockName(const BlockT *BB) {
+  assert(BB && "Unexpected nullptr");
+  if (BB->getBasicBlock())
+    return BB->getName().str();
+  return (Twine("BB") + Twine(BB->getNumber())).str();
+}
+/// \brief Get the name of a BasicBlock.
+template <> inline std::string getBlockName(const BasicBlock *BB) {
+  assert(BB && "Unexpected nullptr");
+  return BB->getName().str();
+}
 }
 
-/// BlockFrequencyInfoImpl implements block frequency algorithm for IR and
-/// Machine Instructions.  Algorithm starts with value ENTRY_FREQ
-/// for the entry block and then propagates frequencies using branch weights
-/// from (Machine)BranchProbabilityInfo.  LoopInfo is not required because
-/// algorithm can find "backedges" by itself.
-template <class BT>
-class BlockFrequencyInfoImpl {
+/// \brief Shared implementation for block frequency analysis.
+///
+/// This is a shared implementation of BlockFrequencyInfo and
+/// MachineBlockFrequencyInfo, and calculates the relative frequencies of
+/// blocks.
+///
+/// This algorithm leverages BlockMass and PositiveFloat to maintain precision,
+/// separates mass distribution from loop scaling, and dithers to eliminate
+/// probability mass loss.
+///
+/// The implementation is split between BlockFrequencyInfoImpl, which knows the
+/// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and
+/// BlockFrequencyInfoImplBase, which doesn't.  The base class uses \a
+/// BlockNode, a wrapper around a uint32_t.  BlockNode is numbered from 0 in
+/// reverse post-order.  This gives two advantages: it's easy to compare the
+/// relative ordering of two nodes, and maps keyed on BlockT can be represented
+/// by vectors.
+///
+/// This algorithm is O(V+E), unless there is irreducible control flow, in
+/// which case it's O(V*E) in the worst case.
+///
+/// These are the main stages:
+///
+///  0. Reverse post-order traversal (\a initializeRPOT()).
+///
+///     Run a single post-order traversal and save it (in reverse) in RPOT.
+///     All other stages make use of this ordering.  Save a lookup from BlockT
+///     to BlockNode (the index into RPOT) in Nodes.
+///
+///  1. Loop indexing (\a initializeLoops()).
+///
+///     Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of
+///     the algorithm.  In particular, store the immediate members of each loop
+///     in reverse post-order.
+///
+///  2. Calculate mass and scale in loops (\a computeMassInLoops()).
+///
+///     For each loop (bottom-up), distribute mass through the DAG resulting
+///     from ignoring backedges and treating sub-loops as a single pseudo-node.
+///     Track the backedge mass distributed to the loop header, and use it to
+///     calculate the loop scale (number of loop iterations).
+///
+///     Visiting loops bottom-up is a post-order traversal of loop headers.
+///     For each loop, immediate members that represent sub-loops will already
+///     have been visited and packaged into a pseudo-node.
+///
+///     Distributing mass in a loop is a reverse-post-order traversal through
+///     the loop.  Start by assigning full mass to the Loop header.  For each
+///     node in the loop:
+///
+///      - Fetch and categorize the weight distribution for its successors.
+///        If this is a packaged-subloop, the weight distribution is stored
+///        in \a PackagedLoopData::Exits.  Otherwise, fetch it from
+///        BranchProbabilityInfo.
+///
+///      - Each successor is categorized as \a Weight::Local, a normal
+///        forward edge within the current loop, \a Weight::Backedge, a
+///        backedge to the loop header, or \a Weight::Exit, any successor
+///        outside the loop.  The weight, the successor, and its category
+///        are stored in \a Distribution.  There can be multiple edges to
+///        each successor.
+///
+///      - Normalize the distribution: scale weights down so that their sum
+///        is 32-bits, and coalesce multiple edges to the same node.
+///
+///      - Distribute the mass accordingly, dithering to minimize mass loss,
+///        as described in \a distributeMass().  Mass is distributed in
+///        parallel in two ways: forward, and general.  Local successors
+///        take their mass from the forward mass, while exit and backedge
+///        successors take their mass from the general mass.
+///        Additionally, exit edges use up (ignored) mass from the forward
+///        mass, and local edges use up (ignored) mass from the general
+///        distribution.
+///
+///     Finally, calculate the loop scale from the accumulated backedge mass.
+///
+///  3. Distribute mass in the function (\a computeMassInFunction()).
+///
+///     Finally, distribute mass through the DAG resulting from packaging all
+///     loops in the function.  This uses the same algorithm as distributing
+///     mass in a loop, except that there are no exit or backedge edges.
+///
+///  4. Loop unpackaging and cleanup (\a finalizeMetrics()).
+///
+///     Initialize the frequency to a floating point representation of its
+///     mass.
+///
+///     Visit loops top-down (reverse post-order), scaling the loop header's
+///     frequency by its pseudo-node's mass and loop scale.  Keep track of the
+///     minimum and maximum final frequencies.
+///
+///     Using the min and max frequencies as a guide, translate floating point
+///     frequencies to an appropriate range in uint64_t.
+///
+/// It has some known flaws.
+///
+///   - Irreducible control flow isn't modelled correctly.  In particular,
+///     LoopInfo and MachineLoopInfo ignore irreducible backedges.  The main
+///     result is that irreducible SCCs will be under-scaled.  No mass is
+///     lost, but the computed branch weights for the loop pseudo-node will be
+///     incorrect.
+///
+///     Modelling irreducible control flow exactly involves setting up and
+///     solving a group of infinite geometric series.  Such precision is
+///     unlikely to be worthwhile, since most of our algorithms give up on
+///     irreducible control flow anyway.
+///
+///     Nevertheless, we might find that we need to get closer.  If
+///     LoopInfo/MachineLoopInfo flags loops with irreducible control flow
+///     (and/or the function as a whole), we can find the SCCs, compute an
+///     approximate exit frequency for the SCC as a whole, and scale up
+///     accordingly.
+///
+///   - Loop scale is limited to 4096 per loop (2^12) to avoid exhausting
+///     BlockFrequency's 64-bit integer precision.
+template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
   typedef typename bfi_detail::TypeMap<BT>::BlockT BlockT;
   typedef typename bfi_detail::TypeMap<BT>::FunctionT FunctionT;
   typedef typename bfi_detail::TypeMap<BT>::BranchProbabilityInfoT
       BranchProbabilityInfoT;
+  typedef typename bfi_detail::TypeMap<BT>::LoopT LoopT;
+  typedef typename bfi_detail::TypeMap<BT>::LoopInfoT LoopInfoT;
 
-  DenseMap<const BlockT *, BlockFrequency> Freqs;
+  typedef GraphTraits<const BlockT *> Successor;
+  typedef GraphTraits<Inverse<const BlockT *>> Predecessor;
 
-  BranchProbabilityInfoT *BPI;
+  const BranchProbabilityInfoT *BPI;
+  const LoopInfoT *LI;
+  const FunctionT *F;
 
-  FunctionT *Fn;
+  // All blocks in reverse postorder.
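+  // (RPOT doubles as the BlockNode-to-block map: getBlock() below simply
+  // returns RPOT[Node.Index], and Nodes stores the inverse mapping.)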
+  std::vector<const BlockT *> RPOT;
+  DenseMap<const BlockT *, BlockNode> Nodes;
 
-  typedef GraphTraits< Inverse<BlockT *> > GT;
+  typedef typename std::vector<const BlockT *>::const_iterator rpot_iterator;
 
-  static const uint64_t EntryFreq = 1 << 14;
+  rpot_iterator rpot_begin() const { return RPOT.begin(); }
+  rpot_iterator rpot_end() const { return RPOT.end(); }
 
-  std::string getBlockName(BasicBlock *BB) const {
-    return BB->getName().str();
+  size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); }
+
+  BlockNode getNode(const rpot_iterator &I) const {
+    return BlockNode(getIndex(I));
+  }
+  BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB); }
+
+  const BlockT *getBlock(const BlockNode &Node) const {
+    return RPOT[Node.Index];
   }
-  std::string getBlockName(MachineBasicBlock *MBB) const {
-    std::string str;
-    raw_string_ostream ss(str);
-    ss << "BB#" << MBB->getNumber();
+  void initializeRPOT();
+  void initializeLoops();
+  void runOnFunction(const FunctionT *F);
 
-    if (const BasicBlock *BB = MBB->getBasicBlock())
-      ss << " derived from LLVM BB " << BB->getName();
+  void propagateMassToSuccessors(const BlockNode &LoopHead,
+                                 const BlockNode &Node);
+  void computeMassInLoops();
+  void computeMassInLoop(const BlockNode &LoopHead);
+  void computeMassInFunction();
 
-    return ss.str();
-  }
-
-  void setBlockFreq(BlockT *BB, BlockFrequency Freq) {
-    Freqs[BB] = Freq;
-    DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") = ";
-          printBlockFreq(dbgs(), Freq) << "\n");
-  }
-
-  /// getEdgeFreq - Return edge frequency based on SRC frequency and Src -> Dst
-  /// edge probability.
-  BlockFrequency getEdgeFreq(BlockT *Src, BlockT *Dst) const {
-    BranchProbability Prob = BPI->getEdgeProbability(Src, Dst);
-    return getBlockFreq(Src) * Prob;
-  }
-
-  /// incBlockFreq - Increase BB block frequency by FREQ.
-  ///
-  void incBlockFreq(BlockT *BB, BlockFrequency Freq) {
-    Freqs[BB] += Freq;
-    DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") += ";
-          printBlockFreq(dbgs(), Freq) << " --> ";
-          printBlockFreq(dbgs(), Freqs[BB]) << "\n");
-  }
-
-  // All blocks in postorder.
-  std::vector<BlockT *> POT;
-
-  // Map Block -> Position in reverse-postorder list.
-  DenseMap<BlockT *, unsigned> RPO;
-
-  // For each loop header, record the per-iteration probability of exiting the
-  // loop.  This is the reciprocal of the expected number of loop iterations.
-  typedef DenseMap<BlockT *, BranchProbability> LoopExitProbMap;
-  LoopExitProbMap LoopExitProb;
-
-  // (reverse-)postorder traversal iterators.
-  typedef typename std::vector<BlockT *>::iterator pot_iterator;
-  typedef typename std::vector<BlockT *>::reverse_iterator rpot_iterator;
-
-  pot_iterator pot_begin() { return POT.begin(); }
-  pot_iterator pot_end() { return POT.end(); }
-
-  rpot_iterator rpot_begin() { return POT.rbegin(); }
-  rpot_iterator rpot_end() { return POT.rend(); }
-
-  rpot_iterator rpot_at(BlockT *BB) {
-    rpot_iterator I = rpot_begin();
-    unsigned idx = RPO.lookup(BB);
-    assert(idx);
-    std::advance(I, idx - 1);
-
-    assert(*I == BB);
-    return I;
-  }
-
-  /// isBackedge - Return if edge Src -> Dst is a reachable backedge.
-  ///
-  bool isBackedge(BlockT *Src, BlockT *Dst) const {
-    unsigned a = RPO.lookup(Src);
-    if (!a)
-      return false;
-    unsigned b = RPO.lookup(Dst);
-    assert(b && "Destination block should be reachable");
-    return a >= b;
-  }
-
-  /// getSingleBlockPred - return single BB block predecessor or NULL if
-  /// BB has none or more predecessors.
-  BlockT *getSingleBlockPred(BlockT *BB) {
-    typename GT::ChildIteratorType
-      PI = GraphTraits< Inverse<BlockT *> >::child_begin(BB),
-      PE = GraphTraits< Inverse<BlockT *> >::child_end(BB);
-
-    if (PI == PE)
-      return nullptr;
-
-    BlockT *Pred = *PI;
-
-    ++PI;
-    if (PI != PE)
-      return nullptr;
-
-    return Pred;
-  }
-
-  void doBlock(BlockT *BB, BlockT *LoopHead,
-               SmallPtrSet<BlockT *, 8> &BlocksInLoop) {
-
-    DEBUG(dbgs() << "doBlock(" << getBlockName(BB) << ")\n");
-    setBlockFreq(BB, 0);
-
-    if (BB == LoopHead) {
-      setBlockFreq(BB, EntryFreq);
-      return;
-    }
-
-    if (BlockT *Pred = getSingleBlockPred(BB)) {
-      if (BlocksInLoop.count(Pred))
-        setBlockFreq(BB, getEdgeFreq(Pred, BB));
-      // TODO: else? irreducible, ignore it for now.
-      return;
-    }
-
-    bool isInLoop = false;
-    bool isLoopHead = false;
-
-    for (typename GT::ChildIteratorType
-           PI = GraphTraits< Inverse<BlockT *> >::child_begin(BB),
-           PE = GraphTraits< Inverse<BlockT *> >::child_end(BB);
-         PI != PE; ++PI) {
-      BlockT *Pred = *PI;
-
-      if (isBackedge(Pred, BB)) {
-        isLoopHead = true;
-      } else if (BlocksInLoop.count(Pred)) {
-        incBlockFreq(BB, getEdgeFreq(Pred, BB));
-        isInLoop = true;
-      }
-      // TODO: else? irreducible.
-    }
-
-    if (!isInLoop)
-      return;
-
-    if (!isLoopHead)
-      return;
-
-    // This block is a loop header, so boost its frequency by the expected
-    // number of loop iterations.  The loop blocks will be revisited so they
-    // all get this boost.
-    typename LoopExitProbMap::const_iterator I = LoopExitProb.find(BB);
-    assert(I != LoopExitProb.end() && "Loop header missing from table");
-    Freqs[BB] /= I->second;
-    DEBUG(dbgs() << "Loop header scaled to ";
-          printBlockFreq(dbgs(), Freqs[BB]) << ".\n");
-  }
-
-  /// doLoop - Propagate block frequency down through the loop.
-  void doLoop(BlockT *Head, BlockT *Tail) {
-    DEBUG(dbgs() << "doLoop(" << getBlockName(Head) << ", "
-                 << getBlockName(Tail) << ")\n");
-
-    SmallPtrSet<BlockT *, 8> BlocksInLoop;
-
-    for (rpot_iterator I = rpot_at(Head), E = rpot_at(Tail); ; ++I) {
-      BlockT *BB = *I;
-      doBlock(BB, Head, BlocksInLoop);
-
-      BlocksInLoop.insert(BB);
-      if (I == E)
-        break;
-    }
-
-    // Compute loop's cyclic probability using backedges probabilities.
-    BlockFrequency BackFreq;
-    for (typename GT::ChildIteratorType
-           PI = GraphTraits< Inverse<BlockT *> >::child_begin(Head),
-           PE = GraphTraits< Inverse<BlockT *> >::child_end(Head);
-         PI != PE; ++PI) {
-      BlockT *Pred = *PI;
-      assert(Pred);
-      if (isBackedge(Pred, Head))
-        BackFreq += getEdgeFreq(Pred, Head);
-    }
-
-    // The cyclic probability is freq(BackEdges) / freq(Head), where freq(Head)
-    // only counts edges entering the loop, not the loop backedges.
-    // The probability of leaving the loop on each iteration is:
-    //
-    //   ExitProb = 1 - CyclicProb
-    //
-    // The Expected number of loop iterations is:
-    //
-    //   Iterations = 1 / ExitProb
-    //
-    uint64_t D = std::max(getBlockFreq(Head).getFrequency(), UINT64_C(1));
-    uint64_t N = std::max(BackFreq.getFrequency(), UINT64_C(1));
-    if (N < D)
-      N = D - N;
-    else
-      // We'd expect N < D, but rounding and saturation means that can't be
-      // guaranteed.
-      N = 1;
-
-    // Now ExitProb = N / D, make sure it fits in an i32/i32 fraction.
-    assert(N <= D);
-    if (D > UINT32_MAX) {
-      unsigned Shift = 32 - countLeadingZeros(D);
-      D >>= Shift;
-      N >>= Shift;
-      if (N == 0)
-        N = 1;
-    }
-    BranchProbability LEP = BranchProbability(N, D);
-    LoopExitProb.insert(std::make_pair(Head, LEP));
-    DEBUG(dbgs() << "LoopExitProb[" << getBlockName(Head) << "] = " << LEP
-                 << " from 1 - ";
-          printBlockFreq(dbgs(), BackFreq) << " / ";
-          printBlockFreq(dbgs(), getBlockFreq(Head)) << ".\n");
-  }
-
-  friend class BlockFrequencyInfo;
-  friend class MachineBlockFrequencyInfo;
-
-  BlockFrequencyInfoImpl() { }
-
-  void doFunction(FunctionT *fn, BranchProbabilityInfoT *bpi) {
-    Fn = fn;
-    BPI = bpi;
-
-    // Clear everything.
-    RPO.clear();
-    POT.clear();
-    LoopExitProb.clear();
-    Freqs.clear();
-
-    BlockT *EntryBlock = fn->begin();
-
-    std::copy(po_begin(EntryBlock), po_end(EntryBlock), std::back_inserter(POT));
-
-    unsigned RPOidx = 0;
-    for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
-      BlockT *BB = *I;
-      RPO[BB] = ++RPOidx;
-      DEBUG(dbgs() << "RPO[" << getBlockName(BB) << "] = " << RPO[BB] << "\n");
-    }
-
-    // Travel over all blocks in postorder.
-    for (pot_iterator I = pot_begin(), E = pot_end(); I != E; ++I) {
-      BlockT *BB = *I;
-      BlockT *LastTail = nullptr;
-      DEBUG(dbgs() << "POT: " << getBlockName(BB) << "\n");
-
-      for (typename GT::ChildIteratorType
-             PI = GraphTraits< Inverse<BlockT *> >::child_begin(BB),
-             PE = GraphTraits< Inverse<BlockT *> >::child_end(BB);
-           PI != PE; ++PI) {
-
-        BlockT *Pred = *PI;
-        if (isBackedge(Pred, BB) && (!LastTail || RPO[Pred] > RPO[LastTail]))
-          LastTail = Pred;
-      }
-
-      if (LastTail)
-        doLoop(BB, LastTail);
-    }
-
-    // At the end assume the whole function as a loop, and travel over it once
-    // again.
-    doLoop(*(rpot_begin()), *(pot_begin()));
+  std::string getBlockName(const BlockNode &Node) const override {
+    return bfi_detail::getBlockName(getBlock(Node));
   }
 
 public:
+  const FunctionT *getFunction() const { return F; }
 
-  uint64_t getEntryFreq() { return EntryFreq; }
+  void doFunction(const FunctionT *F, const BranchProbabilityInfoT *BPI,
+                  const LoopInfoT *LI);
+  BlockFrequencyInfoImpl() : BPI(0), LI(0), F(0) {}
 
-  /// getBlockFreq - Return block frequency. Return 0 if we don't have it.
+  using BlockFrequencyInfoImplBase::getEntryFreq;
   BlockFrequency getBlockFreq(const BlockT *BB) const {
-    typename DenseMap<const BlockT *, BlockFrequency>::const_iterator
-      I = Freqs.find(BB);
-    if (I != Freqs.end())
-      return I->second;
-    return 0;
+    return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB));
+  }
+  Float getFloatingBlockFreq(const BlockT *BB) const {
+    return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB));
   }
 
-  void print(raw_ostream &OS) const {
-    OS << "\n\n---- Block Freqs ----\n";
-    for (typename FunctionT::iterator I = Fn->begin(), E = Fn->end(); I != E;) {
-      BlockT *BB = I++;
-      OS << " " << getBlockName(BB) << " = ";
-      printBlockFreq(OS, getBlockFreq(BB)) << "\n";
+  /// \brief Print the frequencies for the current function.
+  ///
+  /// Prints the frequencies for the blocks in the current function.
+  ///
+  /// Blocks are printed in the natural iteration order of the function, rather
+  /// than reverse post-order.  This provides two advantages: writing -analyze
+  /// tests is easier (since blocks come out in source order), and even
+  /// unreachable blocks are printed.
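+  ///
+  /// For example, the output resembles the following sketch (block names and
+  /// numbers here are illustrative, not verbatim output):
+  ///
+  /// \code
+  ///   block-frequency-info: foo
+  ///    - entry: float = 1.0, int = 8
+  ///    - loop.body: float = 10.0, int = 80
+  /// \endcode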
+  raw_ostream &print(raw_ostream &OS) const override;
+  using BlockFrequencyInfoImplBase::dump;

-    for (typename GraphTraits<BlockT *>::ChildIteratorType
-           SI = GraphTraits<BlockT *>::child_begin(BB),
-           SE = GraphTraits<BlockT *>::child_end(BB); SI != SE; ++SI) {
-      BlockT *Succ = *SI;
-      OS << " " << getBlockName(BB) << " -> " << getBlockName(Succ)
-         << " = "; printBlockFreq(OS, getEdgeFreq(BB, Succ)) << "\n";
-    }
-  }
+  using BlockFrequencyInfoImplBase::printBlockFreq;
+  raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const {
+    return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB));
  }
-
-  void dump() const {
-    print(dbgs());
-  }
-
-  // Utility method that looks up the block frequency associated with BB and
-  // prints it to OS.
-  raw_ostream &printBlockFreq(raw_ostream &OS,
-                              const BlockT *BB) {
-    return printBlockFreq(OS, getBlockFreq(BB));
-  }
-
-  raw_ostream &printBlockFreq(raw_ostream &OS,
-                              const BlockFrequency &Freq) const {
-    // Convert fixed-point number to decimal.
-    uint64_t Frequency = Freq.getFrequency();
-    OS << Frequency / EntryFreq << ".";
-    uint64_t Rem = Frequency % EntryFreq;
-    uint64_t Eps = 1;
-    do {
-      Rem *= 10;
-      Eps *= 10;
-      OS << Rem / EntryFreq;
-      Rem = Rem % EntryFreq;
-    } while (Rem >= Eps/2);
-    return OS;
-  }
-
};

+template <class BT>
+void BlockFrequencyInfoImpl<BT>::doFunction(const FunctionT *F,
+                                            const BranchProbabilityInfoT *BPI,
+                                            const LoopInfoT *LI) {
+  // Save the parameters.
+  this->BPI = BPI;
+  this->LI = LI;
+  this->F = F;
+
+  // Clean up left-over data structures.
+  BlockFrequencyInfoImplBase::clear();
+  RPOT.clear();
+  Nodes.clear();
+
+  // Initialize.
+  DEBUG(dbgs() << "\nblock-frequency: " << F->getName() << "\n================="
+               << std::string(F->getName().size(), '=') << "\n");
+  initializeRPOT();
+  initializeLoops();
+
+  // Visit loops in post-order to find the local mass distribution, and then
+  // do the full function.
+  computeMassInLoops();
+  computeMassInFunction();
+  finalizeMetrics();
+}
+
+template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() {
+  const BlockT *Entry = F->begin();
+  RPOT.reserve(F->size());
+  std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT));
+  std::reverse(RPOT.begin(), RPOT.end());
+
+  assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() &&
+         "More nodes in function than Block Frequency Info supports");
+
+  DEBUG(dbgs() << "reverse-post-order-traversal\n");
+  for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
+    BlockNode Node = getNode(I);
+    DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node) << "\n");
+    Nodes[*I] = Node;
+  }
+
+  Working.resize(RPOT.size());
+  Freqs.resize(RPOT.size());
+}
+
+template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() {
+  DEBUG(dbgs() << "loop-detection\n");
+  if (LI->empty())
+    return;
+
+  // Visit loops top down and assign them an index.
+  std::deque<const LoopT *> Q;
+  Q.insert(Q.end(), LI->begin(), LI->end());
+  while (!Q.empty()) {
+    const LoopT *Loop = Q.front();
+    Q.pop_front();
+    Q.insert(Q.end(), Loop->begin(), Loop->end());
+
+    // Save the order this loop was visited.
+    BlockNode Header = getNode(Loop->getHeader());
+    assert(Header.isValid());
+
+    Working[Header.Index].LoopIndex = PackagedLoops.size();
+    PackagedLoops.emplace_back(Header);
+    DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");
+  }
+
+  // Visit nodes in reverse post-order and add them to their deepest containing
+  // loop.
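+  //
+  // For example (illustrative only): with a loop L2 nested inside a loop L1,
+  // a block in L2's body gets ContainingLoop = L2's header, while L2's header
+  // itself is assigned ContainingLoop = L1's header via getParentLoop()
+  // below.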
+  for (size_t Index = 0; Index < RPOT.size(); ++Index) {
+    const LoopT *Loop = LI->getLoopFor(RPOT[Index]);
+    if (!Loop)
+      continue;
+
+    // If this is a loop header, find its parent loop (if any).
+    if (Working[Index].isLoopHeader())
+      if (!(Loop = Loop->getParentLoop()))
+        continue;
+
+    // Add this node to its containing loop's member list.
+    BlockNode Header = getNode(Loop->getHeader());
+    assert(Header.isValid());
+    const auto &HeaderData = Working[Header.Index];
+    assert(HeaderData.isLoopHeader());
+
+    Working[Index].ContainingLoop = Header;
+    PackagedLoops[HeaderData.LoopIndex].Members.push_back(Index);
+    DEBUG(dbgs() << " - loop = " << getBlockName(Header)
+                 << ": member = " << getBlockName(Index) << "\n");
+  }
+}
+
+template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInLoops() {
+  // Visit loops with the deepest first, and the top-level loops last.
+  for (auto L = PackagedLoops.rbegin(), LE = PackagedLoops.rend(); L != LE; ++L)
+    computeMassInLoop(L->Header);
+}
+
+template <class BT>
+void BlockFrequencyInfoImpl<BT>::computeMassInLoop(const BlockNode &LoopHead) {
+  // Compute mass in loop.
+  DEBUG(dbgs() << "compute-mass-in-loop: " << getBlockName(LoopHead) << "\n");
+
+  Working[LoopHead.Index].Mass = BlockMass::getFull();
+  propagateMassToSuccessors(LoopHead, LoopHead);
+
+  for (const BlockNode &M : getLoopPackage(LoopHead).Members)
+    propagateMassToSuccessors(LoopHead, M);
+
+  computeLoopScale(LoopHead);
+  packageLoop(LoopHead);
+}
+
+template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInFunction() {
+  // Compute mass in function.
+  DEBUG(dbgs() << "compute-mass-in-function\n");
+  Working[0].Mass = BlockMass::getFull();
+  for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) {
+    // Check for nodes that have been packaged.
+    BlockNode Node = getNode(I);
+    if (Working[Node.Index].hasLoopHeader())
+      continue;
+
+    propagateMassToSuccessors(BlockNode(), Node);
+  }
+}
+
+template <class BT>
+void
+BlockFrequencyInfoImpl<BT>::propagateMassToSuccessors(const BlockNode &LoopHead,
+                                                      const BlockNode &Node) {
+  DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
+  // Calculate probability for successors.
+  Distribution Dist;
+  if (Node != LoopHead && Working[Node.Index].isLoopHeader())
+    addLoopSuccessorsToDist(LoopHead, Node, Dist);
+  else {
+    const BlockT *BB = getBlock(Node);
+    for (auto SI = Successor::child_begin(BB), SE = Successor::child_end(BB);
+         SI != SE; ++SI)
+      // Do not dereference SI, or getEdgeWeight() is linear in the number of
+      // successors.
+      addToDist(Dist, LoopHead, Node, getNode(*SI), BPI->getEdgeWeight(BB, SI));
+  }
+
+  // Distribute mass to successors, saving exit and backedge data in the
+  // loop header.
+  distributeMass(Node, LoopHead, Dist);
+}
+
+template <class BT>
+raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const {
+  if (!F)
+    return OS;
+  OS << "block-frequency-info: " << F->getName() << "\n";
+  for (const BlockT &BB : *F)
+    OS << " - " << bfi_detail::getBlockName(&BB)
+       << ": float = " << getFloatingBlockFreq(&BB)
+       << ", int = " << getBlockFreq(&BB).getFrequency() << "\n";
+
+  // Add an extra newline for readability.
+ OS << "\n"; + return OS; +} } #endif diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp index 39aef9e1401..13ab29a94d3 100644 --- a/lib/Analysis/BlockFrequencyInfo.cpp +++ b/lib/Analysis/BlockFrequencyInfo.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "block-freq" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" #include "llvm/Analysis/BranchProbabilityInfo.h" @@ -106,6 +107,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis", true, true) @@ -120,14 +122,16 @@ BlockFrequencyInfo::~BlockFrequencyInfo() {} void BlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); + AU.addRequired(); AU.setPreservesAll(); } bool BlockFrequencyInfo::runOnFunction(Function &F) { BranchProbabilityInfo &BPI = getAnalysis(); + LoopInfo &LI = getAnalysis(); if (!BFI) BFI.reset(new ImplType); - BFI->doFunction(&F, &BPI); + BFI->doFunction(&F, &BPI, &LI); #ifndef NDEBUG if (ViewBlockFreqPropagationDAG != GVDT_None) view(); @@ -158,7 +162,7 @@ void BlockFrequencyInfo::view() const { } const Function *BlockFrequencyInfo::getFunction() const { - return BFI ? BFI->Fn : nullptr; + return BFI ? BFI->getFunction() : nullptr; } raw_ostream &BlockFrequencyInfo:: diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp new file mode 100644 index 00000000000..f267a9cdc5c --- /dev/null +++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -0,0 +1,931 @@ +//===- BlockFrequencyImplInfo.cpp - Block Frequency Info Implementation ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Loops should be simplified before this analysis. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "block-freq" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// +// PositiveFloat implementation. +// +//===----------------------------------------------------------------------===// +const int PositiveFloatBase::MaxExponent; +const int PositiveFloatBase::MinExponent; + +static void appendDigit(std::string &Str, unsigned D) { + assert(D < 10); + Str += '0' + D % 10; +} + +static void appendNumber(std::string &Str, uint64_t N) { + while (N) { + appendDigit(Str, N % 10); + N /= 10; + } +} + +static bool doesRoundUp(char Digit) { + switch (Digit) { + case '5': + case '6': + case '7': + case '8': + case '9': + return true; + default: + return false; + } +} + +static std::string toStringAPFloat(uint64_t D, int E, unsigned Precision) { + assert(E >= PositiveFloatBase::MinExponent); + assert(E <= PositiveFloatBase::MaxExponent); + + // Find a new E, but don't let it increase past MaxExponent. 
+  int LeadingZeros = PositiveFloatBase::countLeadingZeros64(D);
+  int NewE = std::min(PositiveFloatBase::MaxExponent, E + 63 - LeadingZeros);
+  int Shift = 63 - (NewE - E);
+  assert(Shift <= LeadingZeros);
+  assert(Shift == LeadingZeros || NewE == PositiveFloatBase::MaxExponent);
+  D <<= Shift;
+  E = NewE;
+
+  // Check for a denormal.
+  unsigned AdjustedE = E + 16383;
+  if (!(D >> 63)) {
+    assert(E == PositiveFloatBase::MaxExponent);
+    AdjustedE = 0;
+  }
+
+  // Build the float and print it.
+  uint64_t RawBits[2] = {D, AdjustedE};
+  APFloat Float(APFloat::x87DoubleExtended, APInt(80, RawBits));
+  SmallVector<char, 24> Chars;
+  Float.toString(Chars, Precision, 0);
+  return std::string(Chars.begin(), Chars.end());
+}
+
+static std::string stripTrailingZeros(std::string Float) {
+  size_t NonZero = Float.find_last_not_of('0');
+  assert(NonZero != std::string::npos && "no . in floating point string");
+
+  if (Float[NonZero] == '.')
+    ++NonZero;
+
+  return Float.substr(0, NonZero + 1);
+}
+
+std::string PositiveFloatBase::toString(uint64_t D, int16_t E, int Width,
+                                        unsigned Precision) {
+  if (!D)
+    return "0.0";
+
+  // Canonicalize exponent and digits.
+  uint64_t Above0 = 0;
+  uint64_t Below0 = 0;
+  uint64_t Extra = 0;
+  int ExtraShift = 0;
+  if (E == 0) {
+    Above0 = D;
+  } else if (E > 0) {
+    if (int Shift = std::min(int16_t(countLeadingZeros64(D)), E)) {
+      D <<= Shift;
+      E -= Shift;
+
+      if (!E)
+        Above0 = D;
+    }
+  } else if (E > -64) {
+    Above0 = D >> -E;
+    Below0 = D << (64 + E);
+  } else if (E > -120) {
+    Below0 = D >> (-E - 64);
+    Extra = D << (128 + E);
+    ExtraShift = -64 - E;
+  }
+
+  // Fall back on APFloat for very small and very large numbers.
+  if (!Above0 && !Below0)
+    return toStringAPFloat(D, E, Precision);
+
+  // Append the digits before the decimal.
+  std::string Str;
+  size_t DigitsOut = 0;
+  if (Above0) {
+    appendNumber(Str, Above0);
+    DigitsOut = Str.size();
+  } else
+    appendDigit(Str, 0);
+  std::reverse(Str.begin(), Str.end());
+
+  // Return early if there's nothing after the decimal.
+  if (!Below0)
+    return Str + ".0";
+
+  // Append the decimal and beyond.
+  Str += '.';
+  uint64_t Error = UINT64_C(1) << (64 - Width);
+
+  // We need to shift Below0 to the right to make space for calculating
+  // digits. Save the precision we're losing in Extra.
+  Extra = (Below0 & 0xf) << 56 | (Extra >> 8);
+  Below0 >>= 4;
+  size_t SinceDot = 0;
+  size_t AfterDot = Str.size();
+  do {
+    if (ExtraShift) {
+      --ExtraShift;
+      Error *= 5;
+    } else
+      Error *= 10;
+
+    Below0 *= 10;
+    Extra *= 10;
+    Below0 += (Extra >> 60);
+    Extra = Extra & (UINT64_MAX >> 4);
+    appendDigit(Str, Below0 >> 60);
+    Below0 = Below0 & (UINT64_MAX >> 4);
+    if (DigitsOut || Str.back() != '0')
+      ++DigitsOut;
+    ++SinceDot;
+  } while (Error && (Below0 << 4 | Extra >> 60) >= Error / 2 &&
+           (!Precision || DigitsOut <= Precision || SinceDot < 2));
+
+  // Return early for maximum precision.
+  if (!Precision || DigitsOut <= Precision)
+    return stripTrailingZeros(Str);
+
+  // Find where to truncate.
+  size_t Truncate =
+      std::max(Str.size() - (DigitsOut - Precision), AfterDot + 1);
+
+  // Check if there's anything to truncate.
+  if (Truncate >= Str.size())
+    return stripTrailingZeros(Str);
+
+  bool Carry = doesRoundUp(Str[Truncate]);
+  if (!Carry)
+    return stripTrailingZeros(Str.substr(0, Truncate));
+
+  // Round with the first truncated digit.
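+  //
+  // For example (illustrative only): truncating "3.1997" after "3.19" sets
+  // Carry, the trailing '9' becomes '0', and the carry lands on the '1',
+  // producing "3.20" ("3.2" once trailing zeros are stripped).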
+  for (std::string::reverse_iterator I(Str.begin() + Truncate), E = Str.rend();
+       I != E; ++I) {
+    if (*I == '.')
+      continue;
+    if (*I == '9') {
+      *I = '0';
+      continue;
+    }
+
+    ++*I;
+    Carry = false;
+    break;
+  }
+
+  // Add "1" in front if we still need to carry.
+  return stripTrailingZeros(std::string(Carry, '1') + Str.substr(0, Truncate));
+}
+
+raw_ostream &PositiveFloatBase::print(raw_ostream &OS, uint64_t D, int16_t E,
+                                      int Width, unsigned Precision) {
+  return OS << toString(D, E, Width, Precision);
+}
+
+void PositiveFloatBase::dump(uint64_t D, int16_t E, int Width) {
+  print(dbgs(), D, E, Width, 0) << "[" << Width << ":" << D << "*2^" << E
+                                << "]";
+}
+
+static std::pair<uint64_t, int16_t>
+getRoundedFloat(uint64_t N, bool ShouldRound, int64_t Shift) {
+  if (ShouldRound)
+    if (!++N)
+      // Rounding caused an overflow.
+      return std::make_pair(UINT64_C(1), Shift + 64);
+  return std::make_pair(N, Shift);
+}
+
+std::pair<uint64_t, int16_t> PositiveFloatBase::divide64(uint64_t Dividend,
+                                                         uint64_t Divisor) {
+  // Input should be sanitized.
+  assert(Divisor);
+  assert(Dividend);
+
+  // Minimize size of divisor.
+  int16_t Shift = 0;
+  if (int Zeros = countTrailingZeros(Divisor)) {
+    Shift -= Zeros;
+    Divisor >>= Zeros;
+  }
+
+  // Check for powers of two.
+  if (Divisor == 1)
+    return std::make_pair(Dividend, Shift);
+
+  // Maximize size of dividend.
+  if (int Zeros = countLeadingZeros64(Dividend)) {
+    Shift -= Zeros;
+    Dividend <<= Zeros;
+  }
+
+  // Start with the result of a divide.
+  uint64_t Quotient = Dividend / Divisor;
+  Dividend %= Divisor;
+
+  // Continue building the quotient with long division.
+  //
+  // TODO: continue with larger digits.
+  while (!(Quotient >> 63) && Dividend) {
+    // Shift Dividend, and check for overflow.
+    bool IsOverflow = Dividend >> 63;
+    Dividend <<= 1;
+    --Shift;
+
+    // Divide.
+    bool DoesDivide = IsOverflow || Divisor <= Dividend;
+    Quotient = (Quotient << 1) | uint64_t(DoesDivide);
+    Dividend -= DoesDivide ? Divisor : 0;
+  }
+
+  // Round (the helper also handles overflow of the increment).
+  return getRoundedFloat(Quotient, Dividend >= getHalf(Divisor), Shift);
+}
+
+static void addWithCarry(uint64_t &Upper, uint64_t &Lower, uint64_t N) {
+  uint64_t NewLower = Lower + (N << 32);
+  Upper += (N >> 32) + (NewLower < Lower);
+  Lower = NewLower;
+}
+
+std::pair<uint64_t, int16_t> PositiveFloatBase::multiply64(uint64_t L,
+                                                           uint64_t R) {
+  // Separate into two 32-bit digits (U.L).
+  uint64_t UL = L >> 32, LL = L & UINT32_MAX, UR = R >> 32, LR = R & UINT32_MAX;
+
+  // Compute cross products.
+  uint64_t P1 = UL * UR, P2 = UL * LR, P3 = LL * UR, P4 = LL * LR;
+
+  // Sum into two 64-bit digits.
+  uint64_t Upper = P1, Lower = P4;
+  addWithCarry(Upper, Lower, P2);
+  addWithCarry(Upper, Lower, P3);
+
+  // Check for the lower 32 bits.
+  if (!Upper)
+    return std::make_pair(Lower, 0);
+
+  // Shift as little as possible to maximize precision.
+  unsigned LeadingZeros = countLeadingZeros64(Upper);
+  int16_t Shift = 64 - LeadingZeros;
+  if (LeadingZeros)
+    Upper = Upper << LeadingZeros | Lower >> Shift;
+  bool ShouldRound = Shift && (Lower & UINT64_C(1) << (Shift - 1));
+  return getRoundedFloat(Upper, ShouldRound, Shift);
+}
+
+//===----------------------------------------------------------------------===//
+//
+// BlockMass implementation.
+//
+//===----------------------------------------------------------------------===//
+BlockMass &BlockMass::operator*=(const BranchProbability &P) {
+  uint32_t N = P.getNumerator(), D = P.getDenominator();
+  assert(D && "divide by 0");
+  assert(N <= D && "fraction greater than 1");
+
+  // Fast path for multiplying by 1.0.
+  if (!Mass || N == D)
+    return *this;
+
+  // Get as much precision as we can.
+  int Shift = countLeadingZeros(Mass);
+  uint64_t ShiftedQuotient = (Mass << Shift) / D;
+  uint64_t Product = ShiftedQuotient * N >> Shift;
+
+  // Now check for what's lost.
+  uint64_t Left = ShiftedQuotient * (D - N) >> Shift;
+  uint64_t Lost = Mass - Product - Left;
+
+  // TODO: prove this assertion.
+  assert(Lost <= UINT32_MAX);
+
+  // Take the product plus a portion of the spoils.
+  Mass = Product + Lost * N / D;
+  return *this;
+}
+
+PositiveFloat<uint64_t> BlockMass::toFloat() const {
+  if (isFull())
+    return PositiveFloat<uint64_t>(1, 0);
+  return PositiveFloat<uint64_t>(getMass() + 1, -64);
+}
+
+void BlockMass::dump() const { print(dbgs()); }
+
+static char getHexDigit(int N) {
+  assert(N < 16);
+  if (N < 10)
+    return '0' + N;
+  return 'a' + N - 10;
+}
+raw_ostream &BlockMass::print(raw_ostream &OS) const {
+  for (int Digits = 0; Digits < 16; ++Digits)
+    OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf);
+  return OS;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// BlockFrequencyInfoImpl implementation.
+//
+//===----------------------------------------------------------------------===//
+namespace {
+
+typedef BlockFrequencyInfoImplBase::BlockNode BlockNode;
+typedef BlockFrequencyInfoImplBase::Distribution Distribution;
+typedef BlockFrequencyInfoImplBase::Distribution::WeightList WeightList;
+typedef BlockFrequencyInfoImplBase::Float Float;
+typedef BlockFrequencyInfoImplBase::PackagedLoopData PackagedLoopData;
+typedef BlockFrequencyInfoImplBase::Weight Weight;
+typedef BlockFrequencyInfoImplBase::FrequencyData FrequencyData;
+
+/// \brief Dithering mass distributer.
+///
+/// This class splits up a single mass into portions by weight, dithering to
+/// spread out error. No mass is lost. The dithering precision depends on the
+/// precision of the product of \a BlockMass and \a BranchProbability.
+///
+/// The distribution algorithm follows.
+///
+///  1. Initialize by saving the sum of the weights in \a RemWeight and the
+///     mass to distribute in \a RemMass.
+///
+///  2. For each portion:
+///
+///      1. Construct a branch probability, P, as the portion's weight divided
+///         by the current value of \a RemWeight.
+///      2. Calculate the portion's mass as \a RemMass times P.
+///      3. Update \a RemWeight and \a RemMass at each portion by subtracting
+///         the current portion's weight and mass.
+///
+/// Mass is distributed in two ways: full distribution and forward
+/// distribution. The latter ignores backedges, and uses the parallel fields
+/// \a RemForwardWeight and \a RemForwardMass.
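+///
+/// An illustrative (hypothetical) use, assuming Dist holds exactly two local
+/// weights, 3 and 1:
+///
+/// \code
+///   DitheringDistributer D(Dist, BlockMass::getFull());
+///   BlockMass M1 = D.takeLocalMass(3); // P = 3/4; takes ~3/4 of the mass.
+///   BlockMass M2 = D.takeLocalMass(1); // P = 1/1; takes all that remains.
+/// \endcode
+///
+/// Because each portion is computed against the *remaining* weight and mass,
+/// rounding error from earlier portions is spread into later ones rather than
+/// dropped.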
+struct DitheringDistributer {
+  uint32_t RemWeight;
+  uint32_t RemForwardWeight;
+
+  BlockMass RemMass;
+  BlockMass RemForwardMass;
+
+  DitheringDistributer(Distribution &Dist, const BlockMass &Mass);
+
+  BlockMass takeLocalMass(uint32_t Weight) {
+    (void)takeMass(Weight);
+    return takeForwardMass(Weight);
+  }
+  BlockMass takeExitMass(uint32_t Weight) {
+    (void)takeForwardMass(Weight);
+    return takeMass(Weight);
+  }
+  BlockMass takeBackedgeMass(uint32_t Weight) { return takeMass(Weight); }
+
+private:
+  BlockMass takeForwardMass(uint32_t Weight);
+  BlockMass takeMass(uint32_t Weight);
+};
+}
+
+DitheringDistributer::DitheringDistributer(Distribution &Dist,
+                                           const BlockMass &Mass) {
+  Dist.normalize();
+  RemWeight = Dist.Total;
+  RemForwardWeight = Dist.ForwardTotal;
+  RemMass = Mass;
+  RemForwardMass = Dist.ForwardTotal ? Mass : BlockMass();
+}
+
+BlockMass DitheringDistributer::takeForwardMass(uint32_t Weight) {
+  // Compute the amount of mass to take.
+  assert(Weight && "invalid weight");
+  assert(Weight <= RemForwardWeight);
+  BlockMass Mass = RemForwardMass * BranchProbability(Weight, RemForwardWeight);
+
+  // Decrement totals (dither).
+  RemForwardWeight -= Weight;
+  RemForwardMass -= Mass;
+  return Mass;
+}
+BlockMass DitheringDistributer::takeMass(uint32_t Weight) {
+  assert(Weight && "invalid weight");
+  assert(Weight <= RemWeight);
+  BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight);
+
+  // Decrement totals (dither).
+  RemWeight -= Weight;
+  RemMass -= Mass;
+  return Mass;
+}
+
+void Distribution::add(const BlockNode &Node, uint64_t Amount,
+                       Weight::DistType Type) {
+  assert(Amount && "invalid weight of 0");
+  uint64_t NewTotal = Total + Amount;
+
+  // Check for overflow. It should be impossible to overflow twice.
+  bool IsOverflow = NewTotal < Total;
+  assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow");
+  DidOverflow |= IsOverflow;
+
+  // Update the total.
+  Total = NewTotal;
+
+  // Save the weight.
+  Weight W;
+  W.TargetNode = Node;
+  W.Amount = Amount;
+  W.Type = Type;
+  Weights.push_back(W);
+
+  if (Type == Weight::Backedge)
+    return;
+
+  // Update forward total. Don't worry about overflow here, since then Total
+  // will exceed 32-bits and they'll both be recomputed in normalize().
+  ForwardTotal += Amount;
+}
+
+static void combineWeight(Weight &W, const Weight &OtherW) {
+  assert(OtherW.TargetNode.isValid());
+  if (!W.Amount) {
+    W = OtherW;
+    return;
+  }
+  assert(W.Type == OtherW.Type);
+  assert(W.TargetNode == OtherW.TargetNode);
+  assert(W.Amount < W.Amount + OtherW.Amount);
+  W.Amount += OtherW.Amount;
+}
+static void combineWeightsBySorting(WeightList &Weights) {
+  // Sort so edges to the same node are adjacent.
+  std::sort(Weights.begin(), Weights.end(),
+            [](const Weight &L,
+               const Weight &R) { return L.TargetNode < R.TargetNode; });
+
+  // Combine adjacent edges.
+  WeightList::iterator O = Weights.begin();
+  for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E;
+       ++O, (I = L)) {
+    *O = *I;
+
+    // Find the adjacent weights to the same node.
+    for (++L; L != E && I->TargetNode == L->TargetNode; ++L)
+      combineWeight(*O, *L);
+  }
+
+  // Erase extra entries.
+  Weights.erase(O, Weights.end());
+  return;
+}
+static void combineWeightsByHashing(WeightList &Weights) {
+  // Collect weights into a DenseMap.
+  typedef DenseMap<BlockNode::IndexType, Weight> HashTable;
+  HashTable Combined(NextPowerOf2(2 * Weights.size()));
+  for (const Weight &W : Weights)
+    combineWeight(Combined[W.TargetNode.Index], W);
+
+  // Check whether anything changed.
+  if (Weights.size() == Combined.size())
+    return;
+
+  // Fill in the new weights.
+  Weights.clear();
+  Weights.reserve(Combined.size());
+  for (const auto &I : Combined)
+    Weights.push_back(I.second);
+}
+static void combineWeights(WeightList &Weights) {
+  // Use a hash table for many successors to keep this linear.
+  if (Weights.size() > 128) {
+    combineWeightsByHashing(Weights);
+    return;
+  }
+
+  combineWeightsBySorting(Weights);
+}
+static uint64_t shiftRightAndRound(uint64_t N, int Shift) {
+  assert(Shift >= 0);
+  assert(Shift < 64);
+  if (!Shift)
+    return N;
+  return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1));
+}
+void Distribution::normalize() {
+  // Early exit for termination nodes.
+  if (Weights.empty())
+    return;
+
+  // Only bother if there are multiple successors.
+  if (Weights.size() > 1)
+    combineWeights(Weights);
+
+  // Early exit when combined into a single successor.
+  if (Weights.size() == 1) {
+    Total = 1;
+    ForwardTotal = Weights.front().Type != Weight::Backedge;
+    Weights.front().Amount = 1;
+    return;
+  }
+
+  // Determine how much to shift right so that the total fits into 32-bits.
+  //
+  // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1
+  // for each weight can cause a 32-bit overflow.
+  int Shift = 0;
+  if (DidOverflow)
+    Shift = 33;
+  else if (Total > UINT32_MAX)
+    Shift = 33 - countLeadingZeros(Total);
+
+  // Early exit if nothing needs to be scaled.
+  if (!Shift)
+    return;
+
+  // Recompute the total through accumulation (rather than shifting it) so that
+  // it's accurate after shifting. ForwardTotal is dirty here anyway.
+  Total = 0;
+  ForwardTotal = 0;
+
+  // Sum the weights to each node and shift right if necessary.
+  for (Weight &W : Weights) {
+    // Scale down below UINT32_MAX. Since Shift is larger than necessary, we
+    // can round here without concern about overflow.
+    assert(W.TargetNode.isValid());
+    W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift));
+    assert(W.Amount <= UINT32_MAX);
+
+    // Update the total.
+    Total += W.Amount;
+    if (W.Type == Weight::Backedge)
+      continue;
+
+    // Update the forward total.
+    ForwardTotal += W.Amount;
+  }
+  assert(Total <= UINT32_MAX);
+}
+
+void BlockFrequencyInfoImplBase::clear() {
+  *this = BlockFrequencyInfoImplBase();
+}
+
+/// \brief Clear all memory not needed downstream.
+///
+/// Releases all memory not used downstream. In particular, saves Freqs.
+static void cleanup(BlockFrequencyInfoImplBase &BFI) {
+  std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs));
+  BFI.clear();
+  BFI.Freqs = std::move(SavedFreqs);
+}
+
+/// \brief Get a possibly packaged node.
+///
+/// Get the node currently representing Node, which could be a containing
+/// loop.
+///
+/// This function should only be called when distributing mass. As long as
+/// there are no irreducible edges to Node, it will have complexity O(1) in
+/// this context.
+///
+/// In general, the complexity is O(L), where L is the number of loop headers
+/// Node has been packaged into. Since this method is called in the context
+/// of distributing mass, L will be the number of loop headers an early exit
+/// edge jumps out of.
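+///
+/// For example (illustrative only): if Node has been packaged into loop L1,
+/// and L1's header has in turn been packaged into loop L2, the lookup chases
+/// Node -> L1's header -> L2's header, returning the first node that is not
+/// itself packaged.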
+static BlockNode getPackagedNode(const BlockFrequencyInfoImplBase &BFI, + const BlockNode &Node) { + assert(Node.isValid()); + if (!BFI.Working[Node.Index].IsPackaged) + return Node; + if (!BFI.Working[Node.Index].ContainingLoop.isValid()) + return Node; + return getPackagedNode(BFI, BFI.Working[Node.Index].ContainingLoop); +} + +/// \brief Get the appropriate mass for a possible pseudo-node loop package. +/// +/// Get appropriate mass for Node. If Node is a loop-header (whose loop has +/// been packaged), returns the mass of its pseudo-node. If it's a node inside +/// a packaged loop, it returns the loop's pseudo-node. +static BlockMass &getPackageMass(BlockFrequencyInfoImplBase &BFI, + const BlockNode &Node) { + assert(Node.isValid()); + assert(!BFI.Working[Node.Index].IsPackaged); + if (!BFI.Working[Node.Index].IsAPackage) + return BFI.Working[Node.Index].Mass; + + return BFI.getLoopPackage(Node).Mass; +} + +void BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, + const BlockNode &LoopHead, + const BlockNode &Pred, + const BlockNode &Succ, + uint64_t Weight) { + if (!Weight) + Weight = 1; + +#ifndef NDEBUG + auto debugSuccessor = [&](const char *Type, const BlockNode &Resolved) { + dbgs() << " =>" + << " [" << Type << "] weight = " << Weight; + if (Succ != LoopHead) + dbgs() << ", succ = " << getBlockName(Succ); + if (Resolved != Succ) + dbgs() << ", resolved = " << getBlockName(Resolved); + dbgs() << "\n"; + }; + (void)debugSuccessor; +#endif + + if (Succ == LoopHead) { + DEBUG(debugSuccessor("backedge", Succ)); + Dist.addBackedge(LoopHead, Weight); + return; + } + BlockNode Resolved = getPackagedNode(*this, Succ); + assert(Resolved != LoopHead); + + if (Working[Resolved.Index].ContainingLoop != LoopHead) { + DEBUG(debugSuccessor(" exit ", Resolved)); + Dist.addExit(Resolved, Weight); + return; + } + + if (!LoopHead.isValid() && Resolved < Pred) { + // Irreducible backedge. Skip this edge in the distribution. + DEBUG(debugSuccessor("skipped ", Resolved)); + return; + } + + DEBUG(debugSuccessor(" local ", Resolved)); + Dist.addLocal(Resolved, Weight); +} + +void BlockFrequencyInfoImplBase::addLoopSuccessorsToDist( + const BlockNode &LoopHead, const BlockNode &LocalLoopHead, + Distribution &Dist) { + PackagedLoopData &LoopPackage = getLoopPackage(LocalLoopHead); + const PackagedLoopData::ExitMap &Exits = LoopPackage.Exits; + + // Copy the exit map into Dist. + for (const auto &I : Exits) + addToDist(Dist, LoopHead, LocalLoopHead, I.first, I.second.getMass()); + + // We don't need this map any more. Clear it to prevent quadratic memory + // usage in deeply nested loops with irreducible control flow. + LoopPackage.Exits.clear(); +} + +/// \brief Get the maximum allowed loop scale. +/// +/// Gives the maximum number of estimated iterations allowed for a loop. +/// Downstream users have trouble with very large numbers (even within +/// 64-bits). Perhaps they can be changed to use PositiveFloat. +/// +/// TODO: change downstream users so that this can be increased or removed. +static Float getMaxLoopScale() { return Float(1, 12); } + +/// \brief Compute the loop scale for a loop. +void BlockFrequencyInfoImplBase::computeLoopScale(const BlockNode &LoopHead) { + // Compute loop scale. 
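+  //
+  // For example (illustrative only): a loop whose backedge carries mass 3/4
+  // leaves ExitMass = 1/4, giving a scale of 4; on average the loop body runs
+  // four times per entry into the loop.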
+ DEBUG(dbgs() << "compute-loop-scale: " << getBlockName(LoopHead) << "\n"); + + // LoopScale == 1 / ExitMass + // ExitMass == HeadMass - BackedgeMass + PackagedLoopData &LoopPackage = getLoopPackage(LoopHead); + BlockMass ExitMass = BlockMass::getFull() - LoopPackage.BackedgeMass; + + // Block scale stores the inverse of the scale. + LoopPackage.Scale = ExitMass.toFloat().inverse(); + + DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull() + << " - " << LoopPackage.BackedgeMass << ")\n" + << " - scale = " << LoopPackage.Scale << "\n"); + + if (LoopPackage.Scale > getMaxLoopScale()) { + LoopPackage.Scale = getMaxLoopScale(); + DEBUG(dbgs() << " - reduced-to-max-scale: " << getMaxLoopScale() << "\n"); + } +} + +/// \brief Package up a loop. +void BlockFrequencyInfoImplBase::packageLoop(const BlockNode &LoopHead) { + DEBUG(dbgs() << "packaging-loop: " << getBlockName(LoopHead) << "\n"); + Working[LoopHead.Index].IsAPackage = true; + for (const BlockNode &M : getLoopPackage(LoopHead).Members) { + DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n"); + Working[M.Index].IsPackaged = true; + } +} + +void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, + const BlockNode &LoopHead, + Distribution &Dist) { + BlockMass Mass = getPackageMass(*this, Source); + DEBUG(dbgs() << " => mass: " << Mass + << " ( general | forward )\n"); + + // Distribute mass to successors as laid out in Dist. + DitheringDistributer D(Dist, Mass); + +#ifndef NDEBUG + auto debugAssign = [&](const BlockNode &T, const BlockMass &M, + const char *Desc) { + dbgs() << " => assign " << M << " (" << D.RemMass << "|" + << D.RemForwardMass << ")"; + if (Desc) + dbgs() << " [" << Desc << "]"; + if (T.isValid()) + dbgs() << " to " << getBlockName(T); + dbgs() << "\n"; + }; + (void)debugAssign; +#endif + + PackagedLoopData *LoopPackage = 0; + if (LoopHead.isValid()) + LoopPackage = &getLoopPackage(LoopHead); + for (const Weight &W : Dist.Weights) { + // Check for a local edge (forward and non-exit). + if (W.Type == Weight::Local) { + BlockMass Local = D.takeLocalMass(W.Amount); + getPackageMass(*this, W.TargetNode) += Local; + DEBUG(debugAssign(W.TargetNode, Local, nullptr)); + continue; + } + + // Backedges and exits only make sense if we're processing a loop. + assert(LoopPackage && "backedge or exit outside of loop"); + + // Check for a backedge. + if (W.Type == Weight::Backedge) { + BlockMass Back = D.takeBackedgeMass(W.Amount); + LoopPackage->BackedgeMass += Back; + DEBUG(debugAssign(BlockNode(), Back, "back")); + continue; + } + + // This must be an exit. + assert(W.Type == Weight::Exit); + BlockMass Exit = D.takeExitMass(W.Amount); + LoopPackage->Exits.push_back(std::make_pair(W.TargetNode, Exit)); + DEBUG(debugAssign(W.TargetNode, Exit, "exit")); + } +} + +static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI, + const Float &Min, const Float &Max) { + // Scale the Factor to a size that creates integers. Ideally, integers would + // be scaled so that Max == UINT64_MAX so that they can be best + // differentiated. However, the register allocator currently deals poorly + // with large numbers. Instead, push Min up a little from 1 to give some + // room to differentiate small, unequal numbers. + // + // TODO: fix issues downstream so that ScalingFactor can be Float(1,64)/Max. + Float ScalingFactor = Min.inverse(); + if ((Max / Min).lg() < 60) + ScalingFactor <<= 3; + + // Translate the floats to integers. 
+ DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max + << ", factor = " << ScalingFactor << "\n"); + for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) { + Float Scaled = BFI.Freqs[Index].Floating * ScalingFactor; + BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt()); + DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = " + << BFI.Freqs[Index].Floating << ", scaled = " << Scaled + << ", int = " << BFI.Freqs[Index].Integer << "\n"); + } +} + +static void scaleBlockData(BlockFrequencyInfoImplBase &BFI, + const BlockNode &Node, + const PackagedLoopData &Loop) { + Float F = Loop.Mass.toFloat() * Loop.Scale; + + Float &Current = BFI.Freqs[Node.Index].Floating; + Float Updated = Current * F; + + DEBUG(dbgs() << " - " << BFI.getBlockName(Node) << ": " << Current << " => " + << Updated << "\n"); + + Current = Updated; +} + +/// \brief Unwrap a loop package. +/// +/// Visits all the members of a loop, adjusting their BlockData according to +/// the loop's pseudo-node. +static void unwrapLoopPackage(BlockFrequencyInfoImplBase &BFI, + const BlockNode &Head) { + assert(Head.isValid()); + + PackagedLoopData &LoopPackage = BFI.getLoopPackage(Head); + DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getBlockName(Head) + << ": mass = " << LoopPackage.Mass + << ", scale = " << LoopPackage.Scale << "\n"); + scaleBlockData(BFI, Head, LoopPackage); + + // Propagate the head scale through the loop. Since members are visited in + // RPO, the head scale will be updated by the loop scale first, and then the + // final head scale will be used for updated the rest of the members. + for (const BlockNode &M : LoopPackage.Members) { + const FrequencyData &HeadData = BFI.Freqs[Head.Index]; + FrequencyData &Freqs = BFI.Freqs[M.Index]; + Float NewFreq = Freqs.Floating * HeadData.Floating; + DEBUG(dbgs() << " - " << BFI.getBlockName(M) << ": " << Freqs.Floating + << " => " << NewFreq << "\n"); + Freqs.Floating = NewFreq; + } +} + +void BlockFrequencyInfoImplBase::finalizeMetrics() { + // Set initial frequencies from loop-local masses. + for (size_t Index = 0; Index < Working.size(); ++Index) + Freqs[Index].Floating = Working[Index].Mass.toFloat(); + + // Unwrap loop packages in reverse post-order, tracking min and max + // frequencies. + auto Min = Float::getLargest(); + auto Max = Float::getZero(); + for (size_t Index = 0; Index < Working.size(); ++Index) { + if (Working[Index].isLoopHeader()) + unwrapLoopPackage(*this, BlockNode(Index)); + + // Update max scale. + Min = std::min(Min, Freqs[Index].Floating); + Max = std::max(Max, Freqs[Index].Floating); + } + + // Convert to integers. + convertFloatingToInteger(*this, Min, Max); + + // Clean up data structures. + cleanup(*this); + + // Print out the final stats. 
+  DEBUG(dump());
+}
+
+BlockFrequency
+BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const {
+  if (!Node.isValid())
+    return 0;
+  return Freqs[Node.Index].Integer;
+}
+Float
+BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
+  if (!Node.isValid())
+    return Float::getZero();
+  return Freqs[Node.Index].Floating;
+}
+
+std::string
+BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const {
+  return std::string();
+}
+
+raw_ostream &
+BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
+                                           const BlockNode &Node) const {
+  return OS << getFloatingBlockFreq(Node);
+}
+
+raw_ostream &
+BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
+                                           const BlockFrequency &Freq) const {
+  Float Block(Freq.getFrequency(), 0);
+  Float Entry(getEntryFreq(), 0);
+
+  return OS << Block / Entry;
+}
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index c6d45738852..0b0b2f92ea5 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -7,6 +7,7 @@ add_llvm_library(LLVMAnalysis
  Analysis.cpp
  BasicAliasAnalysis.cpp
  BlockFrequencyInfo.cpp
+  BlockFrequencyInfoImpl.cpp
  BranchProbabilityInfo.cpp
  CFG.cpp
  CFGPrinter.cpp
diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index 70efa307d57..d3ac0c0437d 100644
--- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -11,9 +11,12 @@
//
//===----------------------------------------------------------------------===//

+#define DEBUG_TYPE "block-freq"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
@@ -112,6 +115,7 @@ struct DOTGraphTraits :
INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq",
                      "Machine Block Frequency Analysis", true, true)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq",
                    "Machine Block Frequency Analysis", true, true)

@@ -127,16 +131,18 @@ MachineBlockFrequencyInfo::~MachineBlockFrequencyInfo() {}

void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<MachineBranchProbabilityInfo>();
+  AU.addRequired<MachineLoopInfo>();
  AU.setPreservesAll();
  MachineFunctionPass::getAnalysisUsage(AU);
}

bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
  MachineBranchProbabilityInfo &MBPI =
-    getAnalysis<MachineBranchProbabilityInfo>();
+      getAnalysis<MachineBranchProbabilityInfo>();
+  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
  if (!MBFI)
    MBFI.reset(new ImplType);
-  MBFI->doFunction(&F, &MBPI);
+  MBFI->doFunction(&F, &MBPI, &MLI);
#ifndef NDEBUG
  if (ViewMachineBlockFreqPropagationDAG != GVDT_None) {
    view();
@@ -166,7 +172,7 @@ getBlockFreq(const MachineBasicBlock *MBB) const {
}

const MachineFunction *MachineBlockFrequencyInfo::getFunction() const {
-  return MBFI ? MBFI->Fn : nullptr;
+  return MBFI ?
MBFI->getFunction() : nullptr; } raw_ostream & diff --git a/test/Analysis/BlockFrequencyInfo/bad_input.ll b/test/Analysis/BlockFrequencyInfo/bad_input.ll new file mode 100644 index 00000000000..bcdc1e6f0bc --- /dev/null +++ b/test/Analysis/BlockFrequencyInfo/bad_input.ll @@ -0,0 +1,50 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s + +declare void @g(i32 %x) + +; CHECK-LABEL: Printing analysis {{.*}} for function 'branch_weight_0': +; CHECK-NEXT: block-frequency-info: branch_weight_0 +define void @branch_weight_0(i32 %a) { +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +entry: + br label %for.body + +; Check that we get 1,4 instead of 0,3. +; CHECK-NEXT: for.body: float = 4.0, +for.body: + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + call void @g(i32 %i) + %inc = add i32 %i, 1 + %cmp = icmp ugt i32 %inc, %a + br i1 %cmp, label %for.end, label %for.body, !prof !0 + +; CHECK-NEXT: for.end: float = 1.0, int = [[ENTRY]] +for.end: + ret void +} + +!0 = metadata !{metadata !"branch_weights", i32 0, i32 3} + +; CHECK-LABEL: Printing analysis {{.*}} for function 'infinite_loop' +; CHECK-NEXT: block-frequency-info: infinite_loop +define void @infinite_loop(i1 %x) { +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +entry: + br i1 %x, label %for.body, label %for.end, !prof !1 + +; Check that the loop scale maxes out at 4096, giving 2048 here. +; CHECK-NEXT: for.body: float = 2048.0, +for.body: + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + call void @g(i32 %i) + %inc = add i32 %i, 1 + br label %for.body + +; Check that the exit weight is half of entry, since half is lost in the +; infinite loop above. +; CHECK-NEXT: for.end: float = 0.5, +for.end: + ret void +} + +!1 = metadata !{metadata !"branch_weights", i32 1, i32 1} diff --git a/test/Analysis/BlockFrequencyInfo/basic.ll b/test/Analysis/BlockFrequencyInfo/basic.ll index ce29fb5ce1b..006e6ab4d74 100644 --- a/test/Analysis/BlockFrequencyInfo/basic.ll +++ b/test/Analysis/BlockFrequencyInfo/basic.ll @@ -1,13 +1,14 @@ ; RUN: opt < %s -analyze -block-freq | FileCheck %s define i32 @test1(i32 %i, i32* %a) { -; CHECK: Printing analysis {{.*}} for function 'test1' -; CHECK: entry = 1.0 +; CHECK-LABEL: Printing analysis {{.*}} for function 'test1': +; CHECK-NEXT: block-frequency-info: test1 +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] entry: br label %body ; Loop backedges are weighted and thus their bodies have a greater frequency. -; CHECK: body = 32.0 +; CHECK-NEXT: body: float = 32.0, body: %iv = phi i32 [ 0, %entry ], [ %next, %body ] %base = phi i32 [ 0, %entry ], [ %sum, %body ] @@ -18,29 +19,29 @@ body: %exitcond = icmp eq i32 %next, %i br i1 %exitcond, label %exit, label %body -; CHECK: exit = 1.0 +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] exit: ret i32 %sum } define i32 @test2(i32 %i, i32 %a, i32 %b) { -; CHECK: Printing analysis {{.*}} for function 'test2' -; CHECK: entry = 1.0 +; CHECK-LABEL: Printing analysis {{.*}} for function 'test2': +; CHECK-NEXT: block-frequency-info: test2 +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] entry: %cond = icmp ult i32 %i, 42 br i1 %cond, label %then, label %else, !prof !0 ; The 'then' branch is predicted more likely via branch weight metadata. -; CHECK: then = 0.94116 +; CHECK-NEXT: then: float = 0.9411{{[0-9]*}}, then: br label %exit -; CHECK: else = 0.05877 +; CHECK-NEXT: else: float = 0.05882{{[0-9]*}}, else: br label %exit -; FIXME: It may be a bug that we don't sum back to 1.0. 
-; CHECK: exit = 0.99993 +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] exit: %result = phi i32 [ %a, %then ], [ %b, %else ] ret i32 %result @@ -49,37 +50,37 @@ exit: !0 = metadata !{metadata !"branch_weights", i32 64, i32 4} define i32 @test3(i32 %i, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { -; CHECK: Printing analysis {{.*}} for function 'test3' -; CHECK: entry = 1.0 +; CHECK-LABEL: Printing analysis {{.*}} for function 'test3': +; CHECK-NEXT: block-frequency-info: test3 +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] entry: switch i32 %i, label %case_a [ i32 1, label %case_b i32 2, label %case_c i32 3, label %case_d i32 4, label %case_e ], !prof !1 -; CHECK: case_a = 0.04998 +; CHECK-NEXT: case_a: float = 0.05, case_a: br label %exit -; CHECK: case_b = 0.04998 +; CHECK-NEXT: case_b: float = 0.05, case_b: br label %exit ; The 'case_c' branch is predicted more likely via branch weight metadata. -; CHECK: case_c = 0.79998 +; CHECK-NEXT: case_c: float = 0.8, case_c: br label %exit -; CHECK: case_d = 0.04998 +; CHECK-NEXT: case_d: float = 0.05, case_d: br label %exit -; CHECK: case_e = 0.04998 +; CHECK-NEXT: case_e: float = 0.05, case_e: br label %exit -; FIXME: It may be a bug that we don't sum back to 1.0. -; CHECK: exit = 0.99993 +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] exit: %result = phi i32 [ %a, %case_a ], [ %b, %case_b ], @@ -91,44 +92,50 @@ exit: !1 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 64, i32 4, i32 4} -; CHECK: Printing analysis {{.*}} for function 'nested_loops' -; CHECK: entry = 1.0 -; This test doesn't seem to be assigning sensible frequencies to nested loops. define void @nested_loops(i32 %a) { +; CHECK-LABEL: Printing analysis {{.*}} for function 'nested_loops': +; CHECK-NEXT: block-frequency-info: nested_loops +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] entry: br label %for.cond1.preheader +; CHECK-NEXT: for.cond1.preheader: float = 4001.0, for.cond1.preheader: %x.024 = phi i32 [ 0, %entry ], [ %inc12, %for.inc11 ] br label %for.cond4.preheader +; CHECK-NEXT: for.cond4.preheader: float = 16008001.0, for.cond4.preheader: %y.023 = phi i32 [ 0, %for.cond1.preheader ], [ %inc9, %for.inc8 ] %add = add i32 %y.023, %x.024 br label %for.body6 +; CHECK-NEXT: for.body6: float = 64048012001.0, for.body6: %z.022 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ] %add7 = add i32 %add, %z.022 - tail call void @g(i32 %add7) #2 + tail call void @g(i32 %add7) %inc = add i32 %z.022, 1 %cmp5 = icmp ugt i32 %inc, %a br i1 %cmp5, label %for.inc8, label %for.body6, !prof !2 +; CHECK-NEXT: for.inc8: float = 16008001.0, for.inc8: %inc9 = add i32 %y.023, 1 %cmp2 = icmp ugt i32 %inc9, %a br i1 %cmp2, label %for.inc11, label %for.cond4.preheader, !prof !2 +; CHECK-NEXT: for.inc11: float = 4001.0, for.inc11: %inc12 = add i32 %x.024, 1 %cmp = icmp ugt i32 %inc12, %a br i1 %cmp, label %for.end13, label %for.cond1.preheader, !prof !2 +; CHECK-NEXT: for.end13: float = 1.0, int = [[ENTRY]] for.end13: ret void } -declare void @g(i32) #1 +declare void @g(i32) !2 = metadata !{metadata !"branch_weights", i32 1, i32 4000} diff --git a/test/Analysis/BlockFrequencyInfo/double_exit.ll b/test/Analysis/BlockFrequencyInfo/double_exit.ll new file mode 100644 index 00000000000..2fe617c9f55 --- /dev/null +++ b/test/Analysis/BlockFrequencyInfo/double_exit.ll @@ -0,0 +1,165 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s + +; CHECK-LABEL: Printing analysis {{.*}} for function 'double_exit': +; CHECK-NEXT: block-frequency-info: double_exit +define 
i32 @double_exit(i32 %N) {
+; Mass = 1
+; Frequency = 1
+; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
+entry:
+  br label %outer
+
+; Mass = 1
+; Backedge mass = 1/3, exit mass = 2/3
+; Loop scale = 3/2
+; Pseudo-edges = exit
+; Pseudo-mass = 1
+; Frequency = 1*3/2*1 = 3/2
+; CHECK-NEXT: outer: float = 1.5,
+outer:
+  %I.0 = phi i32 [ 0, %entry ], [ %inc6, %outer.inc ]
+  %Return.0 = phi i32 [ 0, %entry ], [ %Return.1, %outer.inc ]
+  %cmp = icmp slt i32 %I.0, %N
+  br i1 %cmp, label %inner, label %exit, !prof !2 ; 2:1
+
+; Mass = 1
+; Backedge mass = 3/5, exit mass = 2/5
+; Loop scale = 5/2
+; Pseudo-edges = outer.inc @ 1/5, exit @ 1/5
+; Pseudo-mass = 2/3
+; Frequency = 3/2*1*5/2*2/3 = 5/2
+; CHECK-NEXT: inner: float = 2.5,
+inner:
+  %Return.1 = phi i32 [ %Return.0, %outer ], [ %call4, %inner.inc ]
+  %J.0 = phi i32 [ %I.0, %outer ], [ %inc, %inner.inc ]
+  %cmp2 = icmp slt i32 %J.0, %N
+  br i1 %cmp2, label %inner.body, label %outer.inc, !prof !1 ; 4:1
+
+; Mass = 4/5
+; Frequency = 5/2*4/5 = 2
+; CHECK-NEXT: inner.body: float = 2.0,
+inner.body:
+  %call = call i32 @c2(i32 %I.0, i32 %J.0)
+  %tobool = icmp ne i32 %call, 0
+  br i1 %tobool, label %exit, label %inner.inc, !prof !0 ; 3:1
+
+; Mass = 3/5
+; Frequency = 5/2*3/5 = 3/2
+; CHECK-NEXT: inner.inc: float = 1.5,
+inner.inc:
+  %call4 = call i32 @logic2(i32 %Return.1, i32 %I.0, i32 %J.0)
+  %inc = add nsw i32 %J.0, 1
+  br label %inner
+
+; Mass = 1/3
+; Frequency = 3/2*1/3 = 1/2
+; CHECK-NEXT: outer.inc: float = 0.5,
+outer.inc:
+  %inc6 = add nsw i32 %I.0, 1
+  br label %outer
+
+; Mass = 1
+; Frequency = 1
+; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
+exit:
+  %Return.2 = phi i32 [ %Return.1, %inner.body ], [ %Return.0, %outer ]
+  ret i32 %Return.2
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 1, i32 3}
+!1 = metadata !{metadata !"branch_weights", i32 4, i32 1}
+!2 = metadata !{metadata !"branch_weights", i32 2, i32 1}
+
+declare i32 @c2(i32, i32)
+declare i32 @logic2(i32, i32, i32)
+
+; CHECK-LABEL: Printing analysis {{.*}} for function 'double_exit_in_loop':
+; CHECK-NEXT: block-frequency-info: double_exit_in_loop
+define i32 @double_exit_in_loop(i32 %N) {
+; Mass = 1
+; Frequency = 1
+; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
+entry:
+  br label %outer
+
+; Mass = 1
+; Backedge mass = 1/2, exit mass = 1/2
+; Loop scale = 2
+; Pseudo-edges = exit
+; Pseudo-mass = 1
+; Frequency = 1*2*1 = 2
+; CHECK-NEXT: outer: float = 2.0,
+outer:
+  %I.0 = phi i32 [ 0, %entry ], [ %inc12, %outer.inc ]
+  %Return.0 = phi i32 [ 0, %entry ], [ %Return.3, %outer.inc ]
+  %cmp = icmp slt i32 %I.0, %N
+  br i1 %cmp, label %middle, label %exit, !prof !3 ; 1:1
+
+; Mass = 1
+; Backedge mass = 1/3, exit mass = 2/3
+; Loop scale = 3/2
+; Pseudo-edges = outer.inc
+; Pseudo-mass = 1/2
+; Frequency = 2*1*3/2*1/2 = 3/2
+; CHECK-NEXT: middle: float = 1.5,
+middle:
+  %J.0 = phi i32 [ %I.0, %outer ], [ %inc9, %middle.inc ]
+  %Return.1 = phi i32 [ %Return.0, %outer ], [ %Return.2, %middle.inc ]
+  %cmp2 = icmp slt i32 %J.0, %N
+  br i1 %cmp2, label %inner, label %outer.inc, !prof !2 ; 2:1
+
+; Mass = 1
+; Backedge mass = 3/5, exit mass = 2/5
+; Loop scale = 5/2
+; Pseudo-edges = middle.inc @ 1/5, outer.inc @ 1/5
+; Pseudo-mass = 2/3
+; Frequency = 3/2*1*5/2*2/3 = 5/2
+; CHECK-NEXT: inner: float = 2.5,
+inner:
+  %Return.2 = phi i32 [ %Return.1, %middle ], [ %call7, %inner.inc ]
+  %K.0 = phi i32 [ %J.0, %middle ], [ %inc, %inner.inc ]
+  %cmp5 = icmp slt i32 %K.0, %N
+  br i1 %cmp5, label %inner.body, label %middle.inc, !prof !1 ; 4:1
+
+; Mass =
4/5 +; Frequency = 5/2*4/5 = 2 +; CHECK-NEXT: inner.body: float = 2.0, +inner.body: + %call = call i32 @c3(i32 %I.0, i32 %J.0, i32 %K.0) + %tobool = icmp ne i32 %call, 0 + br i1 %tobool, label %outer.inc, label %inner.inc, !prof !0 ; 3:1 + +; Mass = 3/5 +; Frequency = 5/2*3/5 = 3/2 +; CHECK-NEXT: inner.inc: float = 1.5, +inner.inc: + %call7 = call i32 @logic3(i32 %Return.2, i32 %I.0, i32 %J.0, i32 %K.0) + %inc = add nsw i32 %K.0, 1 + br label %inner + +; Mass = 1/3 +; Frequency = 3/2*1/3 = 1/2 +; CHECK-NEXT: middle.inc: float = 0.5, +middle.inc: + %inc9 = add nsw i32 %J.0, 1 + br label %middle + +; Mass = 1/2 +; Frequency = 2*1/2 = 1 +; CHECK-NEXT: outer.inc: float = 1.0, +outer.inc: + %Return.3 = phi i32 [ %Return.2, %inner.body ], [ %Return.1, %middle ] + %inc12 = add nsw i32 %I.0, 1 + br label %outer + +; Mass = 1 +; Frequency = 1 +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] +exit: + ret i32 %Return.0 +} + +!3 = metadata !{metadata !"branch_weights", i32 1, i32 1} + +declare i32 @c3(i32, i32, i32) +declare i32 @logic3(i32, i32, i32, i32) diff --git a/test/Analysis/BlockFrequencyInfo/irreducible.ll b/test/Analysis/BlockFrequencyInfo/irreducible.ll new file mode 100644 index 00000000000..46a2958700e --- /dev/null +++ b/test/Analysis/BlockFrequencyInfo/irreducible.ll @@ -0,0 +1,197 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s + +; A loop with multiple exits should be handled correctly. +; +; CHECK-LABEL: Printing analysis {{.*}} for function 'multiexit': +; CHECK-NEXT: block-frequency-info: multiexit +define void @multiexit(i32 %a) { +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +entry: + br label %loop.1 + +; CHECK-NEXT: loop.1: float = 1.333{{3*}}, +loop.1: + %i = phi i32 [ 0, %entry ], [ %inc.2, %loop.2 ] + call void @f(i32 %i) + %inc.1 = add i32 %i, 1 + %cmp.1 = icmp ugt i32 %inc.1, %a + br i1 %cmp.1, label %exit.1, label %loop.2, !prof !0 + +; CHECK-NEXT: loop.2: float = 0.666{{6*7}}, +loop.2: + call void @g(i32 %inc.1) + %inc.2 = add i32 %inc.1, 1 + %cmp.2 = icmp ugt i32 %inc.2, %a + br i1 %cmp.2, label %exit.2, label %loop.1, !prof !1 + +; CHECK-NEXT: exit.1: float = 0.666{{6*7}}, +exit.1: + call void @h(i32 %inc.1) + br label %return + +; CHECK-NEXT: exit.2: float = 0.333{{3*}}, +exit.2: + call void @i(i32 %inc.2) + br label %return + +; CHECK-NEXT: return: float = 1.0, int = [[ENTRY]] +return: + ret void +} + +declare void @f(i32 %x) +declare void @g(i32 %x) +declare void @h(i32 %x) +declare void @i(i32 %x) + +!0 = metadata !{metadata !"branch_weights", i32 3, i32 3} +!1 = metadata !{metadata !"branch_weights", i32 5, i32 5} + +; The current BlockFrequencyInfo algorithm doesn't handle multiple entrances +; into a loop very well. The frequencies assigned to blocks in the loop are +; predictable (and not absurd), but also not correct and therefore not worth +; testing. +; +; There are two testcases below. +; +; For each testcase, I use a CHECK-NEXT/NOT combo like an XFAIL with the +; granularity of a single check. If/when this behaviour is fixed, we'll know +; about it, and the test should be updated. +; +; Testcase #1 +; =========== +; +; In this case c1 and c2 should have frequencies of 15/7 and 13/7, +; respectively. To calculate this, consider assigning 1.0 to entry, and +; distributing frequency iteratively (to infinity). At the first iteration, +; entry gives 3/4 to c1 and 1/4 to c2. At every step after, c1 and c2 give 3/4 +; of what they have to each other. Somehow, all of it comes out to exit. 
+;
+;      c1 = 3/4 + 1/4*3/4 + 3/4*3^2/4^2 + 1/4*3^3/4^3 + 3/4*3^4/4^4 + ...
+;      c2 = 1/4 + 3/4*3/4 + 1/4*3^2/4^2 + 3/4*3^3/4^3 + 1/4*3^4/4^4 + ...
+;
+; Simplify by splitting up the odd and even terms of the series and taking out
+; factors so that the infinite series matches:
+;
+;      c1 = 3/4 *(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
+;         + 3/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
+;      c2 = 1/4 *(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
+;         + 9/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
+;
+;      c1 = 15/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
+;      c2 = 13/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
+;
+; Since this geometric series sums to 16/7:
+;
+;      c1 = 15/7
+;      c2 = 13/7
+;
+; If we treat c1 and c2 as members of the same loop, the exit frequency of the
+; loop as a whole is 1/4, so the loop scale should be 4.  Summing c1 and c2
+; gives 28/7, or 4.0, which is nice confirmation of the math above.
+;
+; However, assuming c1 precedes c2 in reverse post-order, the current algorithm
+; returns 3/4 and 13/16, respectively.  LoopInfo ignores edges between loops
+; (and doesn't see any loops here at all), and -block-freq ignores the
+; irreducible edge from c2 to c1.
+;
+; CHECK-LABEL: Printing analysis {{.*}} for function 'multientry':
+; CHECK-NEXT: block-frequency-info: multientry
+define void @multientry(i32 %a) {
+; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
+entry:
+  %choose = call i32 @choose(i32 %a)
+  %compare = icmp ugt i32 %choose, %a
+  br i1 %compare, label %c1, label %c2, !prof !2
+
+; This is like a single-line XFAIL (see above).
+; CHECK-NEXT: c1:
+; CHECK-NOT: float = 2.142857{{[0-9]*}},
+c1:
+  %i1 = phi i32 [ %a, %entry ], [ %i2.inc, %c2 ]
+  %i1.inc = add i32 %i1, 1
+  %choose1 = call i32 @choose(i32 %i1)
+  %compare1 = icmp ugt i32 %choose1, %a
+  br i1 %compare1, label %c2, label %exit, !prof !2
+
+; This is like a single-line XFAIL (see above).
+; CHECK-NEXT: c2:
+; CHECK-NOT: float = 1.857142{{[0-9]*}},
+c2:
+  %i2 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ]
+  %i2.inc = add i32 %i2, 1
+  %choose2 = call i32 @choose(i32 %i2)
+  %compare2 = icmp ugt i32 %choose2, %a
+  br i1 %compare2, label %c1, label %exit, !prof !2
+
+; We still shouldn't lose any frequency.
+; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
+exit:
+  ret void
+}
+
+; Testcase #2
+; ===========
+;
+; In this case c1 and c2 should be treated as equals in a single loop.  The
+; exit frequency is 1/3, so the scaling factor for the loop should be 3.0.  The
+; loop is entered 2/3 of the time, and c1 and c2 split the total loop frequency
+; evenly (1/2), so they should each have frequencies of 1.0 (3.0*2/3*1/2).
+; Another way of computing this result is by assigning 1.0 to entry and showing
+; that c1 and c2 should accumulate frequencies of:
+;
+;      1/3 + 2/9 + 4/27 + 8/81 + ...
+;      2^0/3^1 + 2^1/3^2 + 2^2/3^3 + 2^3/3^4 + ...
+;
+; At the first step, c1 and c2 each get 1/3 of the entry.  At each subsequent
+; step, c1 and c2 each get 1/3 of what's left in c1 and c2 combined.  This
+; infinite series sums to 1.
+;
+; However, assuming c1 precedes c2 in reverse post-order, the current algorithm
+; returns 1/2 and 3/4, respectively.  LoopInfo ignores edges between loops (and
+; treats c1 and c2 as self-loops only), and -block-freq ignores the irreducible
+; edge from c2 to c1.
+;
+; Below I use a CHECK-NEXT/NOT combo like an XFAIL with the granularity of a
+; single check.  If/when this behaviour is fixed, we'll know about it, and the
+; test should be updated.
+; +; CHECK-LABEL: Printing analysis {{.*}} for function 'crossloops': +; CHECK-NEXT: block-frequency-info: crossloops +define void @crossloops(i32 %a) { +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +entry: + %choose = call i32 @choose(i32 %a) + switch i32 %choose, label %exit [ i32 1, label %c1 + i32 2, label %c2 ], !prof !3 + +; This is like a single-line XFAIL (see above). +; CHECK-NEXT: c1: +; CHECK-NOT: float = 1.0, +c1: + %i1 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ], [ %i2.inc, %c2 ] + %i1.inc = add i32 %i1, 1 + %choose1 = call i32 @choose(i32 %i1) + switch i32 %choose1, label %exit [ i32 1, label %c1 + i32 2, label %c2 ], !prof !3 + +; This is like a single-line XFAIL (see above). +; CHECK-NEXT: c2: +; CHECK-NOT: float = 1.0, +c2: + %i2 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ], [ %i2.inc, %c2 ] + %i2.inc = add i32 %i2, 1 + %choose2 = call i32 @choose(i32 %i2) + switch i32 %choose2, label %exit [ i32 1, label %c1 + i32 2, label %c2 ], !prof !3 + +; We still shouldn't lose any frequency. +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] +exit: + ret void +} + +declare i32 @choose(i32) + +!2 = metadata !{metadata !"branch_weights", i32 3, i32 1} +!3 = metadata !{metadata !"branch_weights", i32 2, i32 2, i32 2} diff --git a/test/Analysis/BlockFrequencyInfo/loop_with_branch.ll b/test/Analysis/BlockFrequencyInfo/loop_with_branch.ll new file mode 100644 index 00000000000..9d27b6bf0f2 --- /dev/null +++ b/test/Analysis/BlockFrequencyInfo/loop_with_branch.ll @@ -0,0 +1,44 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s + +; CHECK-LABEL: Printing analysis {{.*}} for function 'loop_with_branch': +; CHECK-NEXT: block-frequency-info: loop_with_branch +define void @loop_with_branch(i32 %a) { +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +entry: + %skip_loop = call i1 @foo0(i32 %a) + br i1 %skip_loop, label %skip, label %header, !prof !0 + +; CHECK-NEXT: skip: float = 0.25, +skip: + br label %exit + +; CHECK-NEXT: header: float = 4.5, +header: + %i = phi i32 [ 0, %entry ], [ %i.next, %back ] + %i.next = add i32 %i, 1 + %choose = call i2 @foo1(i32 %i) + switch i2 %choose, label %exit [ i2 0, label %left + i2 1, label %right ], !prof !1 + +; CHECK-NEXT: left: float = 1.5, +left: + br label %back + +; CHECK-NEXT: right: float = 2.25, +right: + br label %back + +; CHECK-NEXT: back: float = 3.75, +back: + br label %header + +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] +exit: + ret void +} + +declare i1 @foo0(i32) +declare i2 @foo1(i32) + +!0 = metadata !{metadata !"branch_weights", i32 1, i32 3} +!1 = metadata !{metadata !"branch_weights", i32 1, i32 2, i32 3} diff --git a/test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll b/test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll new file mode 100644 index 00000000000..d93ffceb5fa --- /dev/null +++ b/test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll @@ -0,0 +1,59 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s + +; CHECK-LABEL: Printing analysis {{.*}} for function 'nested_loop_with_branches' +; CHECK-NEXT: block-frequency-info: nested_loop_with_branches +define void @nested_loop_with_branches(i32 %a) { +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +entry: + %v0 = call i1 @foo0(i32 %a) + br i1 %v0, label %exit, label %outer, !prof !0 + +; CHECK-NEXT: outer: float = 12.0, +outer: + %i = phi i32 [ 0, %entry ], [ %i.next, %inner.end ], [ %i.next, %no_inner ] + %i.next = add i32 %i, 1 + %do_inner = call i1 @foo1(i32 %i) + br i1 %do_inner, label %no_inner, label 
%inner, !prof !0 + +; CHECK-NEXT: inner: float = 36.0, +inner: + %j = phi i32 [ 0, %outer ], [ %j.next, %inner.end ] + %side = call i1 @foo3(i32 %j) + br i1 %side, label %left, label %right, !prof !0 + +; CHECK-NEXT: left: float = 9.0, +left: + %v4 = call i1 @foo4(i32 %j) + br label %inner.end + +; CHECK-NEXT: right: float = 27.0, +right: + %v5 = call i1 @foo5(i32 %j) + br label %inner.end + +; CHECK-NEXT: inner.end: float = 36.0, +inner.end: + %stay_inner = phi i1 [ %v4, %left ], [ %v5, %right ] + %j.next = add i32 %j, 1 + br i1 %stay_inner, label %inner, label %outer, !prof !1 + +; CHECK-NEXT: no_inner: float = 3.0, +no_inner: + %continue = call i1 @foo6(i32 %i) + br i1 %continue, label %outer, label %exit, !prof !1 + +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] +exit: + ret void +} + +declare i1 @foo0(i32) +declare i1 @foo1(i32) +declare i1 @foo2(i32) +declare i1 @foo3(i32) +declare i1 @foo4(i32) +declare i1 @foo5(i32) +declare i1 @foo6(i32) + +!0 = metadata !{metadata !"branch_weights", i32 1, i32 3} +!1 = metadata !{metadata !"branch_weights", i32 3, i32 1} diff --git a/test/CodeGen/XCore/llvm-intrinsics.ll b/test/CodeGen/XCore/llvm-intrinsics.ll index e0acd66e4a6..b436282615c 100644 --- a/test/CodeGen/XCore/llvm-intrinsics.ll +++ b/test/CodeGen/XCore/llvm-intrinsics.ll @@ -287,9 +287,8 @@ define void @Unwind1() { ; CHECKFP: .LBB{{[0-9_]+}} ; CHECKFP-NEXT: ldc r2, 40 ; CHECKFP-NEXT: add r2, r10, r2 -; CHECKFP-NEXT: add r0, r2, r0 +; CHECKFP-NEXT: add r2, r2, r0 ; CHECKFP-NEXT: mov r3, r1 -; CHECKFP-NEXT: mov r2, r0 ; CHECKFP-NEXT: ldw r9, r10[4] ; CHECKFP-NEXT: ldw r8, r10[5] ; CHECKFP-NEXT: ldw r7, r10[6] @@ -337,9 +336,8 @@ define void @Unwind1() { ; CHECK-NEXT: ldc r2, 36 ; CHECK-NEXT: ldaw r3, sp[0] ; CHECK-NEXT: add r2, r3, r2 -; CHECK-NEXT: add r0, r2, r0 +; CHECK-NEXT: add r2, r2, r0 ; CHECK-NEXT: mov r3, r1 -; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: ldw r10, sp[2] ; CHECK-NEXT: ldw r9, sp[3] ; CHECK-NEXT: ldw r8, sp[4]