mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-03 13:31:05 +00:00
e7a1e3ee82
This change is the first in a series of changes improving LLVM's Block Frequency propagation implementation to not lose probability mass in branchy code when propagating block frequency information from a basic block to its successors. This patch is a simple infrastructure improvement that does not actually modify the block frequency algorithm. The specific changes are: 1. Changes the division algorithm used when scaling block frequencies by branch probabilities to a short division algorithm. This gives us the remainder for free as well as providing a nice speed boost. When I benched the old routine and the new routine on a Sandy Bridge iMac with disabled turbo mode performing 8192 iterations on an array of length 32768, I saw ~600% increase in speed in mean/median performance. 2. Exposes a scale method that returns a remainder. This is important so we can ensure that when we scale a block frequency by some branch probability BP = N/D, the remainder from the division by D can be retrieved and propagated to other children to ensure no probability mass is lost (more to come on this). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194950 91177308-0d34-0410-b5e6-96231b3b80d8
173 lines
4.8 KiB
C++
173 lines
4.8 KiB
C++
//====--------------- lib/Support/BlockFrequency.cpp -----------*- C++ -*-====//
|
|
//
|
|
// The LLVM Compiler Infrastructure
|
|
//
|
|
// This file is distributed under the University of Illinois Open Source
|
|
// License. See LICENSE.TXT for details.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file implements Block Frequency class.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "llvm/Support/BranchProbability.h"
|
|
#include "llvm/Support/BlockFrequency.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
#include <cassert>
|
|
|
|
using namespace llvm;
|
|
|
|
/// Multiply the 64-bit value \p freq by the 32-bit value \p N and store the
/// 96-bit product in \p W as three 32-bit words, least significant word first
/// (i.e. the product reads W[2]:W[1]:W[0]).
static void mult96bit(uint64_t freq, uint32_t N, uint32_t W[3]) {
  // Split the multiplicand into 32-bit halves so that each partial product
  // (32 bits x 32 bits) fits in a uint64_t.
  uint64_t u0 = freq & UINT32_MAX;
  uint64_t u1 = freq >> 32;

  // Low partial product: its low word is W[0], its high word is the carry
  // into the next column.
  uint64_t t = u0 * N;
  uint64_t k = t >> 32;
  W[0] = static_cast<uint32_t>(t);

  // High partial product plus carry. This cannot wrap around:
  // (2^32 - 1)^2 + (2^32 - 1) == (2^32 - 1) * 2^32 < 2^64.
  t = u1 * N + k;
  W[1] = static_cast<uint32_t>(t);
  W[2] = static_cast<uint32_t>(t >> 32);
}
|
|
|
|
/// Divide the 96-bit value stored in W[2]:W[1]:W[0] by \p D. Since our word
/// size is a 32 bit unsigned integer and the divisor is a single word, we can
/// use a short division algorithm.
///
/// \param W    The dividend, least significant word first. W[2] must be
///             non-zero.
/// \param D    The divisor. Must be non-zero.
/// \param Rout If non-null, receives the remainder. When the quotient does not
///             fit in 64 bits, \p D is stored instead.
/// \returns The quotient, or UINT64_MAX if the quotient does not fit in 64
///          bits.
static uint64_t divrem96bit(uint32_t W[3], uint32_t D, uint32_t *Rout) {
  assert(D != 0 && "Division by zero");
  // We assume that W[2] is non-zero: if it were zero the dividend would fit in
  // 64 bits and the caller should just use hardware division.
  assert(W[2] && "This routine assumes that W[2] is non-zero since if W[2] is "
                 "zero, the caller should just use 64/32 hardware.");
  uint32_t Q[3] = { 0, 0, 0 };

  // The generalized short division algorithm sets i to m + n - 1, where n is
  // the number of words in the divisor and m is the number of words by which
  // the dividend exceeds the divisor (i.e. m + n == the length of the dividend
  // in words). Due to our assumption that W[2] is non-zero, we know that the
  // dividend is of length 3 implying since n is 1 that m = 2. Thus we set i to
  // m + n - 1 = 2 + 1 - 1 = 2.
  uint32_t R = 0;
  for (int i = 2; i >= 0; --i) {
    // Bring down the next word below the running remainder. Because R < D on
    // entry, PartialD < D * 2^32, so the quotient digit always fits in 32 bits.
    uint64_t PartialD = uint64_t(R) << 32 | W[i];
    if (PartialD < D) {
      // Nothing to divide (this also covers PartialD == 0): the digit is zero
      // and the whole partial dividend carries over as the remainder.
      Q[i] = 0;
      R = uint32_t(PartialD);
    } else {
      Q[i] = uint32_t(PartialD / D);
      R = uint32_t(PartialD % D);
    }
  }

  // If Q[2] is non-zero, then we overflowed.
  uint64_t Result;
  if (Q[2]) {
    Result = UINT64_MAX;
    R = D;
  } else {
    // Form the final uint64_t result, avoiding endianness issues.
    Result = uint64_t(Q[0]) | (uint64_t(Q[1]) << 32);
  }

  if (Rout)
    *Rout = R;

  return Result;
}
|
|
|
|
/// Scale this frequency in place by the fraction N / D, i.e. replace
/// Frequency with (Frequency * N) / D.
///
/// \param N Numerator of the scaling factor.
/// \param D Denominator of the scaling factor. Must be non-zero.
/// \returns The remainder of the division by \p D. On the 96-bit path the
///          value is whatever divrem96bit reports, including its saturated
///          case.
uint32_t BlockFrequency::scale(uint32_t N, uint32_t D) {
  assert(D != 0 && "Division by zero");

  // Calculate Frequency * N.
  uint64_t MulLo = (Frequency & UINT32_MAX) * N;
  uint64_t MulHi = (Frequency >> 32) * N;
  uint64_t MulRes = (MulHi << 32) + MulLo;

  // If the product fits in 64 bits, just use built-in division. The first
  // test ensures the shift of MulHi lost no bits; the second ensures the
  // addition of MulLo did not wrap around.
  if (MulHi <= UINT32_MAX && MulRes >= MulLo) {
    Frequency = MulRes / D;
    // The remainder is strictly less than D, so it fits in 32 bits.
    return static_cast<uint32_t>(MulRes % D);
  }

  // Product overflowed, use 96-bit operations.
  // 96-bit value represented as W[2]:W[1]:W[0].
  uint32_t W[3];
  uint32_t R = 0;
  mult96bit(Frequency, N, W);
  Frequency = divrem96bit(W, D, &R);
  return R;
}
|
|
|
|
/// Multiply this frequency in place by the branch probability \p Prob, i.e.
/// scale it by numerator / denominator. The remainder returned by scale() is
/// discarded.
BlockFrequency &BlockFrequency::operator*=(const BranchProbability &Prob) {
  scale(Prob.getNumerator(), Prob.getDenominator());
  return *this;
}
|
|
|
|
/// Return a copy of this frequency multiplied by the branch probability
/// \p Prob; *this is left unchanged.
const BlockFrequency
BlockFrequency::operator*(const BranchProbability &Prob) const {
  BlockFrequency Freq(Frequency);
  Freq *= Prob;
  return Freq;
}
|
|
|
|
/// Divide this frequency in place by the branch probability \p Prob, i.e.
/// scale it by denominator / numerator. The remainder returned by scale() is
/// discarded. scale() asserts that its divisor is non-zero, so \p Prob must
/// have a non-zero numerator.
BlockFrequency &BlockFrequency::operator/=(const BranchProbability &Prob) {
  scale(Prob.getDenominator(), Prob.getNumerator());
  return *this;
}
|
|
|
|
/// Return a copy of this frequency divided by the branch probability \p Prob;
/// *this is left unchanged.
BlockFrequency BlockFrequency::operator/(const BranchProbability &Prob) const {
  BlockFrequency Freq(Frequency);
  Freq /= Prob;
  return Freq;
}
|
|
|
|
/// Add \p Freq to this frequency in place, saturating at UINT64_MAX instead of
/// wrapping around on overflow.
BlockFrequency &BlockFrequency::operator+=(const BlockFrequency &Freq) {
  // Take a copy of the addend before modifying *this: Freq may alias *this
  // (X += X), in which case Freq.Frequency changes along with Frequency.
  uint64_t Addend = Freq.Frequency;
  Frequency += Addend;

  // An unsigned sum is smaller than one of its operands exactly when the
  // addition wrapped; in that case set the frequency to the maximum value.
  if (Frequency < Addend)
    Frequency = UINT64_MAX;

  return *this;
}
|
|
|
|
/// Return the saturating sum of this frequency and \p Prob; *this is left
/// unchanged. Note that despite its name, \p Prob is a block frequency, not a
/// branch probability.
const BlockFrequency
BlockFrequency::operator+(const BlockFrequency &Prob) const {
  BlockFrequency Freq(Frequency);
  Freq += Prob;
  return Freq;
}
|
|
|
|
uint32_t BlockFrequency::scale(const BranchProbability &Prob) {
|
|
return scale(Prob.getNumerator(), Prob.getDenominator());
|
|
}
|
|
|
|
/// Print this frequency to \p OS as a decimal number, treating Frequency as a
/// fixed-point value whose unit is getEntryFrequency().
void BlockFrequency::print(raw_ostream &OS) const {
  // Convert fixed-point number to decimal: integer part first.
  OS << Frequency / getEntryFrequency() << ".";

  // Then emit fractional digits by repeatedly multiplying the remainder by
  // ten. At least one digit is always printed.
  uint64_t Rem = Frequency % getEntryFrequency();
  uint64_t Eps = 1;
  do {
    Rem *= 10;
    Eps *= 10;
    OS << Rem / getEntryFrequency();
    Rem = Rem % getEntryFrequency();
    // Eps grows tenfold per digit while Rem stays below getEntryFrequency(),
    // so the loop stops once the leftover remainder drops below Eps / 2.
  } while (Rem >= Eps/2);
}
|
|
|
|
namespace llvm {
|
|
|
|
raw_ostream &operator<<(raw_ostream &OS, const BlockFrequency &Freq) {
|
|
Freq.print(OS);
|
|
return OS;
|
|
}
|
|
|
|
}
|