//==- BlockFrequencyInfoImpl.h - Block Frequency Implementation -*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Shared implementation of BlockFrequencyInfo for IR and Machine Instructions.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
#define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <string>
#include <vector>

namespace llvm {

class BranchProbabilityInfo;
class BlockFrequencyInfo;
class MachineBranchProbabilityInfo;
class MachineBlockFrequencyInfo;

namespace bfi_detail {
template <class BlockT> struct TypeMap {};
template <> struct TypeMap<BasicBlock> {
  typedef BasicBlock BlockT;
  typedef Function FunctionT;
  typedef BranchProbabilityInfo BranchProbabilityInfoT;
};
template <> struct TypeMap<MachineBasicBlock> {
  typedef MachineBasicBlock BlockT;
  typedef MachineFunction FunctionT;
  typedef MachineBranchProbabilityInfo BranchProbabilityInfoT;
};
}

/// BlockFrequencyInfoImpl implements the block frequency algorithm for IR and
/// Machine Instructions. The algorithm starts with the value EntryFreq for
/// the entry block and then propagates frequencies through the CFG using
/// branch weights from (Machine)BranchProbabilityInfo. LoopInfo is not
/// required because the algorithm can find "backedges" by itself.
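///
/// A minimal usage sketch (illustrative only; in-tree the friend classes
/// BlockFrequencyInfo and MachineBlockFrequencyInfo drive this, since
/// doFunction() is private):
///
/// \code
///   BlockFrequencyInfoImpl<BasicBlock> Impl;
///   Impl.doFunction(&F, &BPI); // F: Function, BPI: BranchProbabilityInfo
///   BlockFrequency Freq = Impl.getBlockFreq(&F.getEntryBlock());
/// \endcode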
template <class BT>
class BlockFrequencyInfoImpl {
  typedef typename bfi_detail::TypeMap<BT>::BlockT BlockT;
  typedef typename bfi_detail::TypeMap<BT>::FunctionT FunctionT;
  typedef typename bfi_detail::TypeMap<BT>::BranchProbabilityInfoT
  BranchProbabilityInfoT;

  DenseMap<const BlockT *, BlockFrequency> Freqs;

  BranchProbabilityInfoT *BPI;

  FunctionT *Fn;

  typedef GraphTraits< Inverse<BlockT *> > GT;

  static const uint64_t EntryFreq = 1 << 14;
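
  // Illustrative note: frequencies are fixed-point values scaled by
  // EntryFreq = 2^14 = 16384. A block expected to execute 2.5 times per
  // function entry is stored as 2.5 * 16384 = 40960, and printBlockFreq()
  // below converts that back to the decimal string "2.5".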

  std::string getBlockName(BasicBlock *BB) const {
    return BB->getName().str();
  }

  std::string getBlockName(MachineBasicBlock *MBB) const {
    std::string str;
    raw_string_ostream ss(str);
    ss << "BB#" << MBB->getNumber();

    if (const BasicBlock *BB = MBB->getBasicBlock())
      ss << " derived from LLVM BB " << BB->getName();

    return ss.str();
  }

  void setBlockFreq(BlockT *BB, BlockFrequency Freq) {
    Freqs[BB] = Freq;
    DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") = ";
          printBlockFreq(dbgs(), Freq) << "\n");
  }

  /// getEdgeFreq - Return the edge frequency based on the Src frequency and
  /// the Src -> Dst edge probability.
  BlockFrequency getEdgeFreq(BlockT *Src, BlockT *Dst) const {
    BranchProbability Prob = BPI->getEdgeProbability(Src, Dst);
    return getBlockFreq(Src) * Prob;
  }
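
  // For example (illustrative): if freq(Src) is 2.0 (stored as 2 * EntryFreq)
  // and the Src -> Dst probability is 3/4, the edge frequency is 1.5, i.e.
  // (2 * EntryFreq) * 3 / 4 in the fixed-point representation.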

  /// incBlockFreq - Increase BB block frequency by FREQ.
  ///
  void incBlockFreq(BlockT *BB, BlockFrequency Freq) {
    Freqs[BB] += Freq;
    DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") += ";
          printBlockFreq(dbgs(), Freq) << " --> ";
          printBlockFreq(dbgs(), Freqs[BB]) << "\n");
  }

  // All blocks in postorder.
  std::vector<BlockT *> POT;

  // Map Block -> Position in reverse-postorder list.
  DenseMap<BlockT *, unsigned> RPO;

  // For each loop header, record the per-iteration probability of exiting the
  // loop. This is the reciprocal of the expected number of loop iterations.
  typedef DenseMap<BlockT *, BranchProbability> LoopExitProbMap;
  LoopExitProbMap LoopExitProb;

  // (Reverse-)postorder traversal iterators.
  typedef typename std::vector<BlockT *>::iterator pot_iterator;
  typedef typename std::vector<BlockT *>::reverse_iterator rpot_iterator;

  pot_iterator pot_begin() { return POT.begin(); }
  pot_iterator pot_end() { return POT.end(); }

  rpot_iterator rpot_begin() { return POT.rbegin(); }
  rpot_iterator rpot_end() { return POT.rend(); }

  rpot_iterator rpot_at(BlockT *BB) {
    rpot_iterator I = rpot_begin();
    unsigned idx = RPO.lookup(BB);
    assert(idx);
    std::advance(I, idx - 1);

    assert(*I == BB);
    return I;
  }

  /// isBackedge - Return whether the edge Src -> Dst is a reachable backedge.
  ///
  bool isBackedge(BlockT *Src, BlockT *Dst) const {
    unsigned a = RPO.lookup(Src);
    if (!a)
      return false;
    unsigned b = RPO.lookup(Dst);
    assert(b && "Destination block should be reachable");
    return a >= b;
  }
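
  // In a reverse-postorder numbering the source of a non-back edge precedes
  // its destination, so an edge whose source is at or after its destination
  // (a >= b) is treated as a backedge; a self-loop is the degenerate case
  // with a == b.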

  /// getSingleBlockPred - Return the single predecessor of BB, or null if BB
  /// has no predecessors or more than one.
  BlockT *getSingleBlockPred(BlockT *BB) {
    typename GT::ChildIteratorType
      PI = GraphTraits< Inverse<BlockT *> >::child_begin(BB),
      PE = GraphTraits< Inverse<BlockT *> >::child_end(BB);

    if (PI == PE)
      return nullptr;

    BlockT *Pred = *PI;

    ++PI;
    if (PI != PE)
      return nullptr;

    return Pred;
  }

  void doBlock(BlockT *BB, BlockT *LoopHead,
               SmallPtrSet<BlockT *, 8> &BlocksInLoop) {

    DEBUG(dbgs() << "doBlock(" << getBlockName(BB) << ")\n");
    setBlockFreq(BB, 0);

    if (BB == LoopHead) {
      setBlockFreq(BB, EntryFreq);
      return;
    }

    if (BlockT *Pred = getSingleBlockPred(BB)) {
      if (BlocksInLoop.count(Pred))
        setBlockFreq(BB, getEdgeFreq(Pred, BB));
      // TODO: else? irreducible, ignore it for now.
      return;
    }

    bool isInLoop = false;
    bool isLoopHead = false;

    for (typename GT::ChildIteratorType
         PI = GraphTraits< Inverse<BlockT *> >::child_begin(BB),
         PE = GraphTraits< Inverse<BlockT *> >::child_end(BB);
         PI != PE; ++PI) {
      BlockT *Pred = *PI;

      if (isBackedge(Pred, BB)) {
        isLoopHead = true;
      } else if (BlocksInLoop.count(Pred)) {
        incBlockFreq(BB, getEdgeFreq(Pred, BB));
        isInLoop = true;
      }
      // TODO: else? irreducible.
    }

    if (!isInLoop)
      return;

    if (!isLoopHead)
      return;

    // This block is a loop header, so boost its frequency by the expected
    // number of loop iterations. The loop blocks will be revisited so they all
    // get this boost.
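    // (Dividing a BlockFrequency by a BranchProbability p scales it by 1/p,
    // so dividing by the loop-exit probability multiplies the header by the
    // expected iteration count computed in doLoop().)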
    typename LoopExitProbMap::const_iterator I = LoopExitProb.find(BB);
    assert(I != LoopExitProb.end() && "Loop header missing from table");
    Freqs[BB] /= I->second;
    DEBUG(dbgs() << "Loop header scaled to ";
          printBlockFreq(dbgs(), Freqs[BB]) << ".\n");
  }

  /// doLoop - Propagate block frequency down through the loop.
  void doLoop(BlockT *Head, BlockT *Tail) {
    DEBUG(dbgs() << "doLoop(" << getBlockName(Head) << ", "
                 << getBlockName(Tail) << ")\n");

    SmallPtrSet<BlockT *, 8> BlocksInLoop;

    for (rpot_iterator I = rpot_at(Head), E = rpot_at(Tail); ; ++I) {
      BlockT *BB = *I;
      doBlock(BB, Head, BlocksInLoop);

      BlocksInLoop.insert(BB);
      if (I == E)
        break;
    }

    // Compute the loop's cyclic probability using the backedge probabilities.
    BlockFrequency BackFreq;
    for (typename GT::ChildIteratorType
         PI = GraphTraits< Inverse<BlockT *> >::child_begin(Head),
         PE = GraphTraits< Inverse<BlockT *> >::child_end(Head);
         PI != PE; ++PI) {
      BlockT *Pred = *PI;
      assert(Pred);
      if (isBackedge(Pred, Head))
        BackFreq += getEdgeFreq(Pred, Head);
    }

    // The cyclic probability is freq(BackEdges) / freq(Head), where freq(Head)
    // only counts edges entering the loop, not the loop backedges.
    // The probability of leaving the loop on each iteration is:
    //
    //  ExitProb = 1 - CyclicProb
    //
    // The expected number of loop iterations is:
    //
    //  Iterations = 1 / ExitProb
    //
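    // Worked example (illustrative): if freq(Head) is 1.0 (EntryFreq, as set
    // by doBlock()) and the backedges carry a combined frequency of
    // 4000/4001, then N = D - N gives ExitProb = 1/4001 and the expected
    // trip count is 4001.
    //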
    uint64_t D = std::max(getBlockFreq(Head).getFrequency(), UINT64_C(1));
    uint64_t N = std::max(BackFreq.getFrequency(), UINT64_C(1));
    if (N < D)
      N = D - N;
    else
      // We'd expect N < D, but rounding and saturation mean that can't be
      // guaranteed.
      N = 1;

    // Now ExitProb = N / D; make sure it fits in an i32/i32 fraction.
    assert(N <= D);
    if (D > UINT32_MAX) {
      unsigned Shift = 32 - countLeadingZeros(D);
      D >>= Shift;
      N >>= Shift;
      if (N == 0)
        N = 1;
    }
    BranchProbability LEP = BranchProbability(N, D);
    LoopExitProb.insert(std::make_pair(Head, LEP));
    DEBUG(dbgs() << "LoopExitProb[" << getBlockName(Head) << "] = " << LEP
          << " from 1 - ";
          printBlockFreq(dbgs(), BackFreq) << " / ";
          printBlockFreq(dbgs(), getBlockFreq(Head)) << ".\n");
  }

  friend class BlockFrequencyInfo;
  friend class MachineBlockFrequencyInfo;

  BlockFrequencyInfoImpl() { }

  void doFunction(FunctionT *fn, BranchProbabilityInfoT *bpi) {
    Fn = fn;
    BPI = bpi;

    // Clear everything.
    RPO.clear();
    POT.clear();
    LoopExitProb.clear();
    Freqs.clear();

    BlockT *EntryBlock = fn->begin();

    std::copy(po_begin(EntryBlock), po_end(EntryBlock), std::back_inserter(POT));

    unsigned RPOidx = 0;
    for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
      BlockT *BB = *I;
      RPO[BB] = ++RPOidx;
      DEBUG(dbgs() << "RPO[" << getBlockName(BB) << "] = " << RPO[BB] << "\n");
    }
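
    // Note: RPO indices are 1-based, so DenseMap::lookup() returning 0 means
    // a block was not reached by the traversal; isBackedge() and rpot_at()
    // rely on this to detect unreachable blocks.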

    // Travel over all blocks in postorder.
    for (pot_iterator I = pot_begin(), E = pot_end(); I != E; ++I) {
      BlockT *BB = *I;
      BlockT *LastTail = nullptr;
      DEBUG(dbgs() << "POT: " << getBlockName(BB) << "\n");

      for (typename GT::ChildIteratorType
           PI = GraphTraits< Inverse<BlockT *> >::child_begin(BB),
           PE = GraphTraits< Inverse<BlockT *> >::child_end(BB);
           PI != PE; ++PI) {

        BlockT *Pred = *PI;
        if (isBackedge(Pred, BB) && (!LastTail || RPO[Pred] > RPO[LastTail]))
          LastTail = Pred;
      }

      if (LastTail)
        doLoop(BB, LastTail);
    }

    // At the end, treat the whole function as a loop and travel over it once
    // again.
    doLoop(*(rpot_begin()), *(pot_begin()));
  }

public:

  uint64_t getEntryFreq() { return EntryFreq; }

  /// getBlockFreq - Return block frequency. Return 0 if we don't have it.
  BlockFrequency getBlockFreq(const BlockT *BB) const {
    typename DenseMap<const BlockT *, BlockFrequency>::const_iterator
      I = Freqs.find(BB);
    if (I != Freqs.end())
      return I->second;
    return 0;
  }

  void print(raw_ostream &OS) const {
    OS << "\n\n---- Block Freqs ----\n";
    for (typename FunctionT::iterator I = Fn->begin(), E = Fn->end(); I != E;) {
      BlockT *BB = I++;
      OS << " " << getBlockName(BB) << " = ";
      printBlockFreq(OS, getBlockFreq(BB)) << "\n";

      for (typename GraphTraits<BlockT *>::ChildIteratorType
           SI = GraphTraits<BlockT *>::child_begin(BB),
           SE = GraphTraits<BlockT *>::child_end(BB); SI != SE; ++SI) {
        BlockT *Succ = *SI;
        OS << " " << getBlockName(BB) << " -> " << getBlockName(Succ)
           << " = "; printBlockFreq(OS, getEdgeFreq(BB, Succ)) << "\n";
      }
    }
  }

  void dump() const {
    print(dbgs());
  }

  // Utility method that looks up the block frequency associated with BB and
  // prints it to OS.
  raw_ostream &printBlockFreq(raw_ostream &OS,
                              const BlockT *BB) {
    return printBlockFreq(OS, getBlockFreq(BB));
  }

  raw_ostream &printBlockFreq(raw_ostream &OS,
                              const BlockFrequency &Freq) const {
    // Convert fixed-point number to decimal.
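    // (Illustrative example: with EntryFreq = 16384 and Frequency = 40960,
    // the integer part is 40960 / 16384 = 2 and the remainder is 8192; the
    // first fractional digit is 8192 * 10 / 16384 = 5 with remainder 0, so
    // the loop stops and "2.5" is printed.)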
    uint64_t Frequency = Freq.getFrequency();
    OS << Frequency / EntryFreq << ".";
    uint64_t Rem = Frequency % EntryFreq;
    uint64_t Eps = 1;
    do {
      Rem *= 10;
      Eps *= 10;
      OS << Rem / EntryFreq;
      Rem = Rem % EntryFreq;
    } while (Rem >= Eps/2);
    return OS;
  }
};
}
#endif