llvm-6502/include/llvm/Analysis/BlockFrequencyInfoImpl.h


//==- BlockFrequencyInfoImpl.h - Block Frequency Implementation -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Shared implementation of BlockFrequencyInfo for IR and Machine Instructions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
#define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <string>
#include <vector>
namespace llvm {
class BranchProbabilityInfo;
class BlockFrequencyInfo;
class MachineBranchProbabilityInfo;
class MachineBlockFrequencyInfo;
namespace bfi_detail {
template <class BlockT> struct TypeMap {};
template <> struct TypeMap<BasicBlock> {
  typedef BasicBlock BlockT;
  typedef Function FunctionT;
  typedef BranchProbabilityInfo BranchProbabilityInfoT;
};
template <> struct TypeMap<MachineBasicBlock> {
  typedef MachineBasicBlock BlockT;
  typedef MachineFunction FunctionT;
  typedef MachineBranchProbabilityInfo BranchProbabilityInfoT;
};
}
/// BlockFrequencyInfoImpl implements the block frequency algorithm for IR and
/// Machine Instructions. The algorithm starts with a value of EntryFreq for
/// the entry block and then propagates frequencies using branch weights
/// from (Machine)BranchProbabilityInfo. LoopInfo is not required because the
/// algorithm can find "backedges" by itself.
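///
/// A minimal usage sketch (illustrative only -- the constructor and
/// doFunction() are private, so in practice only the friend analyses
/// BlockFrequencyInfo and MachineBlockFrequencyInfo drive it like this):
///
/// \code
///   // Assumes F is a Function and BPI a computed BranchProbabilityInfo.
///   BlockFrequencyInfoImpl<BasicBlock> BFI;
///   BFI.doFunction(&F, &BPI);
///   BlockFrequency Freq = BFI.getBlockFreq(&F.getEntryBlock());
/// \endcode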
template <class BT>
class BlockFrequencyInfoImpl {
  typedef typename bfi_detail::TypeMap<BT>::BlockT BlockT;
  typedef typename bfi_detail::TypeMap<BT>::FunctionT FunctionT;
  typedef typename bfi_detail::TypeMap<BT>::BranchProbabilityInfoT
      BranchProbabilityInfoT;

  DenseMap<const BlockT *, BlockFrequency> Freqs;

  BranchProbabilityInfoT *BPI;

  FunctionT *Fn;

  typedef GraphTraits< Inverse<BlockT *> > GT;
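  // The entry block is assigned a fixed frequency of 2^14 = 16384; all other
  // block frequencies are fixed-point values relative to this scale.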
  static const uint64_t EntryFreq = 1 << 14;
  std::string getBlockName(BasicBlock *BB) const {
    return BB->getName().str();
  }

  std::string getBlockName(MachineBasicBlock *MBB) const {
    std::string str;
    raw_string_ostream ss(str);
    ss << "BB#" << MBB->getNumber();

    if (const BasicBlock *BB = MBB->getBasicBlock())
      ss << " derived from LLVM BB " << BB->getName();

    return ss.str();
  }
  void setBlockFreq(BlockT *BB, BlockFrequency Freq) {
    Freqs[BB] = Freq;
    DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") = ";
          printBlockFreq(dbgs(), Freq) << "\n");
  }
  /// getEdgeFreq - Return the frequency of the Src -> Dst edge, computed from
  /// Src's block frequency and the Src -> Dst edge probability.
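  /// For example, if freq(Src) is 16384 and the Src -> Dst branch probability
  /// is 1/4, the returned edge frequency is 4096.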
  BlockFrequency getEdgeFreq(BlockT *Src, BlockT *Dst) const {
    BranchProbability Prob = BPI->getEdgeProbability(Src, Dst);
    return getBlockFreq(Src) * Prob;
  }
  /// incBlockFreq - Increase the frequency of BB by Freq.
  ///
  void incBlockFreq(BlockT *BB, BlockFrequency Freq) {
    Freqs[BB] += Freq;
    DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") += ";
          printBlockFreq(dbgs(), Freq) << " --> ";
          printBlockFreq(dbgs(), Freqs[BB]) << "\n");
  }
  // All blocks in postorder.
  std::vector<BlockT *> POT;

  // Map Block -> Position in reverse-postorder list.
  DenseMap<BlockT *, unsigned> RPO;

  // For each loop header, record the per-iteration probability of exiting the
  // loop. This is the reciprocal of the expected number of loop iterations.
  typedef DenseMap<BlockT *, BranchProbability> LoopExitProbMap;
  LoopExitProbMap LoopExitProb;
  // (reverse-)postorder traversal iterators.
  typedef typename std::vector<BlockT *>::iterator pot_iterator;
  typedef typename std::vector<BlockT *>::reverse_iterator rpot_iterator;

  pot_iterator pot_begin() { return POT.begin(); }
  pot_iterator pot_end() { return POT.end(); }

  rpot_iterator rpot_begin() { return POT.rbegin(); }
  rpot_iterator rpot_end() { return POT.rend(); }

  rpot_iterator rpot_at(BlockT *BB) {
    rpot_iterator I = rpot_begin();
    unsigned idx = RPO.lookup(BB);
    assert(idx);
    std::advance(I, idx - 1);

    assert(*I == BB);
    return I;
  }
  /// isBackedge - Return true if the edge Src -> Dst is a reachable backedge.
  ///
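  /// In a reverse-postorder numbering the source of a backedge is numbered no
  /// earlier than its destination (e.g. a loop latch is numbered after its
  /// header, and a self-loop gives Src == Dst), which is what the comparison
  /// below checks.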
  bool isBackedge(BlockT *Src, BlockT *Dst) const {
    unsigned a = RPO.lookup(Src);
    if (!a)
      return false;
    unsigned b = RPO.lookup(Dst);
    assert(b && "Destination block should be reachable");
    return a >= b;
  }
  /// getSingleBlockPred - Return the single predecessor of BB, or null if BB
  /// has zero or more than one predecessor.
  BlockT *getSingleBlockPred(BlockT *BB) {
    typename GT::ChildIteratorType
      PI = GraphTraits< Inverse<BlockT *> >::child_begin(BB),
      PE = GraphTraits< Inverse<BlockT *> >::child_end(BB);

    if (PI == PE)
      return nullptr;

    BlockT *Pred = *PI;
    ++PI;
    if (PI != PE)
      return nullptr;

    return Pred;
  }
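  /// doBlock - Compute the frequency of BB within the loop headed by LoopHead:
  /// the loop header itself gets EntryFreq, and every other block accumulates
  /// the incoming edge frequencies from predecessors already in BlocksInLoop.
  /// Nested loop headers are additionally scaled by their expected iteration
  /// count (the reciprocal of their recorded exit probability).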
  void doBlock(BlockT *BB, BlockT *LoopHead,
               SmallPtrSet<BlockT *, 8> &BlocksInLoop) {
    DEBUG(dbgs() << "doBlock(" << getBlockName(BB) << ")\n");
    setBlockFreq(BB, 0);

    if (BB == LoopHead) {
      setBlockFreq(BB, EntryFreq);
      return;
    }

    if (BlockT *Pred = getSingleBlockPred(BB)) {
      if (BlocksInLoop.count(Pred))
        setBlockFreq(BB, getEdgeFreq(Pred, BB));
      // TODO: else? irreducible, ignore it for now.
      return;
    }

    bool isInLoop = false;
    bool isLoopHead = false;

    for (typename GT::ChildIteratorType
           PI = GraphTraits< Inverse<BlockT *> >::child_begin(BB),
           PE = GraphTraits< Inverse<BlockT *> >::child_end(BB);
         PI != PE; ++PI) {
      BlockT *Pred = *PI;

      if (isBackedge(Pred, BB)) {
        isLoopHead = true;
      } else if (BlocksInLoop.count(Pred)) {
        incBlockFreq(BB, getEdgeFreq(Pred, BB));
        isInLoop = true;
      }
      // TODO: else? irreducible.
    }

    if (!isInLoop)
      return;

    if (!isLoopHead)
      return;

    // This block is a loop header, so boost its frequency by the expected
    // number of loop iterations. The loop blocks will be revisited so they all
    // get this boost.
    typename LoopExitProbMap::const_iterator I = LoopExitProb.find(BB);
    assert(I != LoopExitProb.end() && "Loop header missing from table");
    Freqs[BB] /= I->second;
    DEBUG(dbgs() << "Loop header scaled to ";
          printBlockFreq(dbgs(), Freqs[BB]) << ".\n");
  }
  /// doLoop - Propagate block frequency down through the loop.
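  /// Blocks from Head to Tail are visited in reverse-postorder, so each
  /// block's in-loop predecessors are processed before it; afterwards the
  /// per-iteration exit probability for Head is computed from the backedge
  /// frequencies and recorded in LoopExitProb.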
  void doLoop(BlockT *Head, BlockT *Tail) {
    DEBUG(dbgs() << "doLoop(" << getBlockName(Head) << ", "
                 << getBlockName(Tail) << ")\n");
    SmallPtrSet<BlockT *, 8> BlocksInLoop;
    for (rpot_iterator I = rpot_at(Head), E = rpot_at(Tail); ; ++I) {
      BlockT *BB = *I;
      doBlock(BB, Head, BlocksInLoop);

      BlocksInLoop.insert(BB);
      if (I == E)
        break;
    }

    // Compute the loop's cyclic probability from the backedge frequencies.
    BlockFrequency BackFreq;
    for (typename GT::ChildIteratorType
           PI = GraphTraits< Inverse<BlockT *> >::child_begin(Head),
           PE = GraphTraits< Inverse<BlockT *> >::child_end(Head);
         PI != PE; ++PI) {
      BlockT *Pred = *PI;
      assert(Pred);
      if (isBackedge(Pred, Head))
        BackFreq += getEdgeFreq(Pred, Head);
    }

    // The cyclic probability is freq(BackEdges) / freq(Head), where freq(Head)
    // only counts edges entering the loop, not the loop backedges.
    // The probability of leaving the loop on each iteration is:
    //
    //   ExitProb = 1 - CyclicProb
    //
    // The expected number of loop iterations is:
    //
    //   Iterations = 1 / ExitProb
    //
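    // For example, if freq(Head) == 16384 and the backedges contribute a total
    // frequency of 12288, then CyclicProb == 3/4, ExitProb == 1/4, and the
    // expected iteration count is 4; doBlock() applies that boost when this
    // header is revisited as part of an enclosing loop (or the final
    // whole-function pass).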
    uint64_t D = std::max(getBlockFreq(Head).getFrequency(), UINT64_C(1));
    uint64_t N = std::max(BackFreq.getFrequency(), UINT64_C(1));

    if (N < D)
      N = D - N;
    else
      // We'd expect N < D, but rounding and saturation mean that can't be
      // guaranteed.
      N = 1;

    // Now ExitProb = N / D, make sure it fits in an i32/i32 fraction.
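    // For example, a 40-bit denominator is shifted right by 8 bits so that
    // both N and D fit in 32 bits (N is clamped back to 1 if it shifts to 0).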
    assert(N <= D);
    if (D > UINT32_MAX) {
      unsigned Shift = 32 - countLeadingZeros(D);
      D >>= Shift;
      N >>= Shift;
      if (N == 0)
        N = 1;
    }
    BranchProbability LEP = BranchProbability(N, D);
    LoopExitProb.insert(std::make_pair(Head, LEP));
    DEBUG(dbgs() << "LoopExitProb[" << getBlockName(Head) << "] = " << LEP
                 << " from 1 - ";
          printBlockFreq(dbgs(), BackFreq) << " / ";
          printBlockFreq(dbgs(), getBlockFreq(Head)) << ".\n");
  }
  friend class BlockFrequencyInfo;
  friend class MachineBlockFrequencyInfo;

  BlockFrequencyInfoImpl() { }
  void doFunction(FunctionT *fn, BranchProbabilityInfoT *bpi) {
    Fn = fn;
    BPI = bpi;

    // Clear everything.
    RPO.clear();
    POT.clear();
    LoopExitProb.clear();
    Freqs.clear();

    BlockT *EntryBlock = fn->begin();

    std::copy(po_begin(EntryBlock), po_end(EntryBlock), std::back_inserter(POT));

    unsigned RPOidx = 0;
    for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
      BlockT *BB = *I;
      RPO[BB] = ++RPOidx;
      DEBUG(dbgs() << "RPO[" << getBlockName(BB) << "] = " << RPO[BB] << "\n");
    }
    // Walk all blocks in postorder.
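    // Postorder visits inner loop headers before the headers of the loops
    // that enclose them, so each inner loop's exit probability is already in
    // LoopExitProb by the time an enclosing doLoop() revisits its header
    // (assuming reducible control flow).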
    for (pot_iterator I = pot_begin(), E = pot_end(); I != E; ++I) {
      BlockT *BB = *I;
      BlockT *LastTail = nullptr;
      DEBUG(dbgs() << "POT: " << getBlockName(BB) << "\n");

      for (typename GT::ChildIteratorType
             PI = GraphTraits< Inverse<BlockT *> >::child_begin(BB),
             PE = GraphTraits< Inverse<BlockT *> >::child_end(BB);
           PI != PE; ++PI) {
        BlockT *Pred = *PI;

        if (isBackedge(Pred, BB) && (!LastTail || RPO[Pred] > RPO[LastTail]))
          LastTail = Pred;
      }
      if (LastTail)
        doLoop(BB, LastTail);
    }

    // Finally, treat the whole function as a single loop and traverse it once
    // more.
    doLoop(*(rpot_begin()), *(pot_begin()));
  }
public:
  uint64_t getEntryFreq() { return EntryFreq; }

  /// getBlockFreq - Return block frequency. Return 0 if we don't have it.
  BlockFrequency getBlockFreq(const BlockT *BB) const {
    typename DenseMap<const BlockT *, BlockFrequency>::const_iterator
      I = Freqs.find(BB);
    if (I != Freqs.end())
      return I->second;
    return 0;
  }
  void print(raw_ostream &OS) const {
    OS << "\n\n---- Block Freqs ----\n";
    for (typename FunctionT::iterator I = Fn->begin(), E = Fn->end(); I != E;) {
      BlockT *BB = I++;
      OS << " " << getBlockName(BB) << " = ";
      printBlockFreq(OS, getBlockFreq(BB)) << "\n";

      for (typename GraphTraits<BlockT *>::ChildIteratorType
             SI = GraphTraits<BlockT *>::child_begin(BB),
             SE = GraphTraits<BlockT *>::child_end(BB); SI != SE; ++SI) {
        BlockT *Succ = *SI;
        OS << " " << getBlockName(BB) << " -> " << getBlockName(Succ)
           << " = "; printBlockFreq(OS, getEdgeFreq(BB, Succ)) << "\n";
      }
    }
  }
  void dump() const {
    print(dbgs());
  }
  // Utility method that looks up the block frequency associated with BB and
  // prints it to OS.
  raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) {
    return printBlockFreq(OS, getBlockFreq(BB));
  }

  raw_ostream &printBlockFreq(raw_ostream &OS,
                              const BlockFrequency &Freq) const {
    // Convert fixed-point number to decimal.
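    // For example, with EntryFreq = 2^14, a raw frequency of 40960 prints as
    // "2.5" (40960 / 16384 == 2, and the remainder 8192 / 16384 == 0.5).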
    uint64_t Frequency = Freq.getFrequency();
    OS << Frequency / EntryFreq << ".";

    uint64_t Rem = Frequency % EntryFreq;
    uint64_t Eps = 1;
    do {
      Rem *= 10;
      Eps *= 10;
      OS << Rem / EntryFreq;
      Rem = Rem % EntryFreq;
    } while (Rem >= Eps/2);

    return OS;
  }
};
}
#endif