Fix PR 23525 - Separate header mass propagation in irregular loops.

Summary:
When propagating mass through irregular loops, the mass flowing through
each loop header may not be equal. This was causing wrong frequencies
to be computed for irregular loop headers.

Fixed by keeping track of masses flowing through each of the headers in
an irregular loop. To do this, we now keep track of per-header backedge
weights. After the loop mass is distributed through the loop, the
backedge weights are used to re-distribute the loop mass to the loop
headers.

Since each backedge will have a mass proportional to the different
branch weights, the loop headers will end up with a more approximate
weight distribution (as opposed to the current distribution that assumes
that every loop header is the same).

Reviewers: dexonsmith

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D10348

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239843 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Diego Novillo 2015-06-16 19:10:58 +00:00
parent bfbac55e4e
commit 3f53fc8f5f
4 changed files with 173 additions and 40 deletions

View File

@ -196,23 +196,26 @@ public:
struct LoopData {
typedef SmallVector<std::pair<BlockNode, BlockMass>, 4> ExitMap;
typedef SmallVector<BlockNode, 4> NodeList;
LoopData *Parent; ///< The parent loop.
bool IsPackaged; ///< Whether this has been packaged.
uint32_t NumHeaders; ///< Number of headers.
ExitMap Exits; ///< Successor edges (and weights).
NodeList Nodes; ///< Header and the members of the loop.
BlockMass BackedgeMass; ///< Mass returned to loop header.
typedef SmallVector<BlockMass, 1> HeaderMassList;
LoopData *Parent; ///< The parent loop.
bool IsPackaged; ///< Whether this has been packaged.
uint32_t NumHeaders; ///< Number of headers.
ExitMap Exits; ///< Successor edges (and weights).
NodeList Nodes; ///< Header and the members of the loop.
HeaderMassList BackedgeMass; ///< Mass returned to each loop header.
BlockMass Mass;
Scaled64 Scale;
LoopData(LoopData *Parent, const BlockNode &Header)
: Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header) {}
: Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header),
BackedgeMass(1) {}
template <class It1, class It2>
LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther,
It2 LastOther)
: Parent(Parent), IsPackaged(false), Nodes(FirstHeader, LastHeader) {
NumHeaders = Nodes.size();
Nodes.insert(Nodes.end(), FirstOther, LastOther);
BackedgeMass.resize(NumHeaders);
}
bool isHeader(const BlockNode &Node) const {
if (isIrreducible())
@ -223,6 +226,14 @@ public:
BlockNode getHeader() const { return Nodes[0]; }
bool isIrreducible() const { return NumHeaders > 1; }
HeaderMassList::difference_type headerIndexFor(const BlockNode &B) {
assert(isHeader(B) && "this is only valid on loop header blocks");
if (isIrreducible())
return std::lower_bound(Nodes.begin(), Nodes.begin() + NumHeaders, B) -
Nodes.begin();
return 0;
}
NodeList::const_iterator members_begin() const {
return Nodes.begin() + NumHeaders;
}
@ -431,6 +442,16 @@ public:
/// \brief Compute the loop scale for a loop.
void computeLoopScale(LoopData &Loop);
/// Adjust the mass of all headers in an irreducible loop.
///
/// Initially, irreducible loops are assumed to distribute their mass
/// equally among its headers. This can lead to wrong frequency estimates
/// since some headers may be executed more frequently than others.
///
/// This adjusts header mass distribution so it matches the weights of
/// the backedges going into each of the loop headers.
void adjustLoopHeaderMass(LoopData &Loop);
/// \brief Package up a loop.
void packageLoop(LoopData &Loop);
@ -735,11 +756,6 @@ void IrreducibleGraph::addEdges(const BlockNode &Node,
/// as sub-loops, rather than arbitrarily shoving the problematic
/// blocks into the headers of the main irreducible SCC.
///
/// - Backedge frequencies are assumed to be evenly split between the
/// headers of a given irreducible SCC. Instead, we could track the
/// backedge mass separately for each header, and adjust their relative
/// frequencies.
///
/// - Entry frequencies are assumed to be evenly split between the
/// headers of a given irreducible SCC, which is the only option if we
/// need to compute mass in the SCC before its parent loop. Instead,
@ -1042,6 +1058,8 @@ bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
for (const BlockNode &M : Loop.Nodes)
if (!propagateMassToSuccessors(&Loop, M))
llvm_unreachable("unhandled irreducible control flow");
adjustLoopHeaderMass(Loop);
} else {
Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
if (!propagateMassToSuccessors(&Loop, Loop.getHeader()))

View File

@ -286,7 +286,7 @@ bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
if (isLoopHeader(Resolved)) {
DEBUG(debugSuccessor("backedge"));
Dist.addBackedge(OuterLoop->getHeader(), Weight);
Dist.addBackedge(Resolved, Weight);
return true;
}
@ -349,7 +349,10 @@ void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
// LoopScale == 1 / ExitMass
// ExitMass == HeadMass - BackedgeMass
BlockMass ExitMass = BlockMass::getFull() - Loop.BackedgeMass;
BlockMass TotalBackedgeMass;
for (auto &Mass : Loop.BackedgeMass)
TotalBackedgeMass += Mass;
BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass;
// Block scale stores the inverse of the scale. If this is an infinite loop,
// its exit mass will be zero. In this case, use an arbitrary scale for the
@ -358,7 +361,7 @@ void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
ExitMass.isEmpty() ? InifiniteLoopScale : ExitMass.toScaled().inverse();
DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
<< " - " << Loop.BackedgeMass << ")\n"
<< " - " << TotalBackedgeMass << ")\n"
<< " - scale = " << Loop.Scale << "\n");
}
@ -375,6 +378,19 @@ void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) {
Loop.IsPackaged = true;
}
#ifndef NDEBUG
static void debugAssign(const BlockFrequencyInfoImplBase &BFI,
const DitheringDistributer &D, const BlockNode &T,
const BlockMass &M, const char *Desc) {
dbgs() << " => assign " << M << " (" << D.RemMass << ")";
if (Desc)
dbgs() << " [" << Desc << "]";
if (T.isValid())
dbgs() << " to " << BFI.getBlockName(T);
dbgs() << "\n";
}
#endif
void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
LoopData *OuterLoop,
Distribution &Dist) {
@ -384,25 +400,12 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
// Distribute mass to successors as laid out in Dist.
DitheringDistributer D(Dist, Mass);
#ifndef NDEBUG
auto debugAssign = [&](const BlockNode &T, const BlockMass &M,
const char *Desc) {
dbgs() << " => assign " << M << " (" << D.RemMass << ")";
if (Desc)
dbgs() << " [" << Desc << "]";
if (T.isValid())
dbgs() << " to " << getBlockName(T);
dbgs() << "\n";
};
(void)debugAssign;
#endif
for (const Weight &W : Dist.Weights) {
// Check for a local edge (non-backedge and non-exit).
BlockMass Taken = D.takeMass(W.Amount);
if (W.Type == Weight::Local) {
Working[W.TargetNode.Index].getMass() += Taken;
DEBUG(debugAssign(W.TargetNode, Taken, nullptr));
DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
continue;
}
@ -411,15 +414,16 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
// Check for a backedge.
if (W.Type == Weight::Backedge) {
OuterLoop->BackedgeMass += Taken;
DEBUG(debugAssign(BlockNode(), Taken, "back"));
auto ix = OuterLoop->headerIndexFor(W.TargetNode);
OuterLoop->BackedgeMass[ix] += Taken;
DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back"));
continue;
}
// This must be an exit.
assert(W.Type == Weight::Exit);
OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken));
DEBUG(debugAssign(W.TargetNode, Taken, "exit"));
DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit"));
}
}
@ -713,10 +717,44 @@ BlockFrequencyInfoImplBase::analyzeIrreducible(
void
BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) {
OuterLoop.Exits.clear();
OuterLoop.BackedgeMass = BlockMass::getEmpty();
for (auto &Mass : OuterLoop.BackedgeMass)
Mass = BlockMass::getEmpty();
auto O = OuterLoop.Nodes.begin() + 1;
for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I)
if (!Working[I->Index].isPackaged())
*O++ = *I;
OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end());
}
void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) {
assert(Loop.isIrreducible() && "this only makes sense on irreducible loops");
// Since the loop has more than one header block, the mass flowing back into
// each header will be different. Adjust the mass in each header loop to
// reflect the masses flowing through back edges.
//
// To do this, we distribute the initial mass using the backedge masses
// as weights for the distribution.
BlockMass LoopMass = BlockMass::getFull();
Distribution Dist;
DEBUG(dbgs() << "adjust-loop-header-mass:\n");
for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
auto &HeaderNode = Loop.Nodes[H];
auto &BackedgeMass = Loop.BackedgeMass[Loop.headerIndexFor(HeaderNode)];
DEBUG(dbgs() << " - Add back edge mass for node "
<< getBlockName(HeaderNode) << ": " << BackedgeMass << "\n");
Dist.addLocal(HeaderNode, BackedgeMass.getMass());
}
DitheringDistributer D(Dist, LoopMass);
DEBUG(dbgs() << " Distribute loop mass " << LoopMass
<< " to headers using above weights\n");
for (const Weight &W : Dist.Weights) {
BlockMass Taken = D.takeMass(W.Amount);
assert(W.Type == Weight::Local && "all weights should be local");
Working[W.TargetNode.Index].getMass() = Taken;
DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
}
}

View File

@ -0,0 +1,80 @@
; RUN: opt < %s -analyze -block-freq | FileCheck %s
@g = global i32 0, align 4
; Function Attrs: inlinehint noinline nounwind uwtable
define i32 @_Z8hot_loopi(i32 %n) !prof !1 {
entry:
%div = sdiv i32 %n, 2
%rem12 = and i32 %n, 1
%cmp = icmp eq i32 %rem12, 0
br i1 %cmp, label %Next, label %for.cond, !prof !2
; CHECK: - for.cond: float = 25.85{{[0-9]*}}, int = 206
for.cond: ; preds = %entry, %for.inc
%i.0 = phi i32 [ %inc, %for.inc ], [ %div, %entry ]
%cmp1 = icmp slt i32 %i.0, %n
br i1 %cmp1, label %for.body, label %for.end, !prof !3, !llvm.loop !4
; CHECK: - for.body: float = 24.52, int = 196
for.body: ; preds = %for.cond
%rem213 = and i32 %i.0, 1
%cmp3 = icmp eq i32 %rem213, 0
br i1 %cmp3, label %if.then.4, label %Next, !prof !6
; CHECK: - if.then.4: float = 12.26{{[0-9]*}}, int = 98
if.then.4: ; preds = %for.body
%0 = load i32, i32* @g, align 4, !tbaa !7
%mul = shl nsw i32 %0, 1
br label %for.inc
; CHECK: - Next: float = 12.41{{[0-9]*}}, int = 99
Next: ; preds = %for.body, %entry
%i.1 = phi i32 [ %div, %entry ], [ %i.0, %for.body ]
%1 = load i32, i32* @g, align 4, !tbaa !7
%add = add nsw i32 %1, %n
br label %for.inc
; CHECK: - for.inc: float = 38.28{{[0-9]*}}, int = 306
for.inc: ; preds = %if.then.4, %Next
%storemerge = phi i32 [ %add, %Next ], [ %mul, %if.then.4 ]
%i.2 = phi i32 [ %i.1, %Next ], [ %i.0, %if.then.4 ]
store i32 %storemerge, i32* @g, align 4, !tbaa !7
%inc = add nsw i32 %i.2, 1
br label %for.cond
; CHECK: - for.end: float = 1.0, int = 8
for.end: ; preds = %for.cond
%2 = load i32, i32* @g, align 4, !tbaa !7
ret i32 %2
}
; Function Attrs: nounwind uwtable
define i32 @main() !prof !11 {
entry:
br label %for.body
for.cond.cleanup: ; preds = %for.body
ret i32 0
for.body: ; preds = %for.body, %entry
%i.04 = phi i32 [ 1, %entry ], [ %inc, %for.body ]
%call = tail call i32 @_Z8hot_loopi(i32 %i.04)
%inc = add nuw nsw i32 %i.04, 1
%exitcond = icmp eq i32 %inc, 100
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !12
}
!1 = !{!"function_entry_count", i64 99}
!2 = !{!"branch_weights", i32 50, i32 51}
!3 = !{!"branch_weights", i32 2452, i32 100}
!4 = distinct !{!4, !5}
!5 = !{!"llvm.loop.unroll.disable"}
!6 = !{!"branch_weights", i32 1227, i32 1226}
!7 = !{!8, !8, i64 0}
!8 = !{!"int", !9, i64 0}
!9 = !{!"omnipotent char", !10, i64 0}
!10 = !{!"Simple C/C++ TBAA"}
!11 = !{!"function_entry_count", i64 1}
!12 = !{!"branch_weights", i32 2, i32 100}

View File

@ -130,9 +130,6 @@ exit:
; At the first step, c1 and c2 each get 1/3 of the entry. At each subsequent
; step, c1 and c2 each get 1/3 of what's left in c1 and c2 combined. This
; infinite series sums to 1.
;
; Since the currently algorithm *always* assumes entry blocks are equal,
; -block-freq gets the right answers here.
define void @crossloops(i2 %x) {
; CHECK-LABEL: Printing analysis {{.*}} for function 'crossloops':
; CHECK-NEXT: block-frequency-info: crossloops
@ -386,7 +383,7 @@ exit:
;
; This testcases uses non-trivial branch weights. The CHECK statements here
; will start to fail if we change -block-freq to be more accurate. Currently,
; we expect left, right and top to be treated as equal headers.
; loop headers are affected by the weight of their corresponding back edges.
define void @nonentry_header(i1 %x, i2 %y) {
; CHECK-LABEL: Printing analysis {{.*}} for function 'nonentry_header':
; CHECK-NEXT: block-frequency-info: nonentry_header
@ -395,15 +392,15 @@ entry:
br i1 %x, label %left, label %right, !prof !21
left:
; CHECK-NEXT: left: float = 3.0,
; CHECK-NEXT: left: float = 0.14{{[0-9]*}},
br i1 %x, label %top, label %bottom, !prof !22
right:
; CHECK-NEXT: right: float = 3.0,
; CHECK-NEXT: right: float = 0.42{{[0-9]*}},
br i1 %x, label %top, label %bottom, !prof !22
top:
; CHECK-NEXT: top: float = 3.0,
; CHECK-NEXT: top: float = 8.43{{[0-9]*}},
switch i2 %y, label %exit [ i2 0, label %left
i2 1, label %right
i2 2, label %bottom ], !prof !23