Adds cyclic critical path computation and heuristics, temporarily disabled.
Estimate the cyclic critical path within a single block loop. If the acyclic critical path is longer, then the loop will exhaust OOO resources after some number of iterations. If the lag between the acyclic critical path and the cyclic critical path is longer than the time it takes to issue those loop iterations, then aggressively schedule for latency.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189120 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent 99093638a0
commit ea57433cee
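
In plainer terms, the heuristic added here (see checkAcyclicLatency() in the diff below) compares the latency lag between the two paths against the machine's out-of-order buffering. The following is a minimal standalone sketch of that decision rule; the free function and its parameter list are illustrative rather than part of the patch, and the inputs are assumed to correspond to the TargetSchedModel quantities queried in the real code:

    // Sketch of the acyclic-latency-limit test from checkAcyclicLatency().
    //   CriticalPath:   acyclic critical path (DAG->ExitSU.getDepth()).
    //   CyclicCritPath: result of computeCyclicCriticalPath().
    //   LatencyFactor, MicroOpBufferSize, MicroOpFactor: machine-model values.
    static bool isAcyclicLatencyLimited(unsigned CriticalPath,
                                        unsigned CyclicCritPath,
                                        unsigned LatencyFactor,
                                        unsigned MicroOpBufferSize,
                                        unsigned MicroOpFactor) {
      // A cyclic path at least as long as the acyclic one means the loop is
      // throughput bound across iterations, so no latency bias is needed.
      if (CyclicCritPath == 0 || CyclicCritPath >= CriticalPath)
        return false;
      // The loop is latency limited when the lag between the two paths,
      // converted to issue units, exceeds what the micro-op buffer can hide.
      unsigned BufferLimit = MicroOpBufferSize * MicroOpFactor;
      unsigned LatencyLag = CriticalPath - CyclicCritPath;
      return LatencyLag * LatencyFactor > BufferLimit;
    }
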
include/llvm/CodeGen/ScheduleDAGInstrs.h

@@ -197,6 +197,9 @@ namespace llvm {
     /// input.
     void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker = 0);

+    /// Compute the cyclic critical path through the DAG.
+    unsigned computeCyclicCriticalPath();
+
     /// addSchedBarrierDeps - Add dependencies from instructions in the current
     /// list of instructions being scheduled to scheduling barrier. We want to
     /// make sure instructions which define registers that are either used by
lib/CodeGen/MachineScheduler.cpp

@@ -53,6 +53,9 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
 static bool ViewMISchedDAGs = false;
 #endif // NDEBUG

+static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
+  cl::desc("Enable cyclic critical path analysis."), cl::init(false));
+
 static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
   cl::desc("Enable load clustering."), cl::init(true));

@@ -1207,16 +1210,21 @@ public:
 struct SchedRemainder {
   // Critical path through the DAG in expected latency.
   unsigned CriticalPath;
+  unsigned CyclicCritPath;

   // Scaled count of micro-ops left to schedule.
   unsigned RemIssueCount;

+  bool IsAcyclicLatencyLimited;
+
   // Unscheduled resources
   SmallVector<unsigned, 16> RemainingCounts;

   void reset() {
     CriticalPath = 0;
+    CyclicCritPath = 0;
     RemIssueCount = 0;
+    IsAcyclicLatencyLimited = false;
     RemainingCounts.clear();
   }

@@ -1434,6 +1442,8 @@ public:
   virtual void registerRoots();

 protected:
+  void checkAcyclicLatency();
+
   void tryCandidate(SchedCandidate &Cand,
                     SchedCandidate &TryCand,
                     SchedBoundary &Zone,
@@ -1547,8 +1557,32 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
   Bot.releaseNode(SU, SU->BotReadyCycle);
 }

+void ConvergingScheduler::checkAcyclicLatency() {
+  if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
+    return;
+
+  unsigned BufferLimit =
+    SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
+  unsigned LatencyLag = Rem.CriticalPath - Rem.CyclicCritPath;
+  Rem.IsAcyclicLatencyLimited =
+    (LatencyLag * SchedModel->getLatencyFactor()) > BufferLimit;
+
+  DEBUG(dbgs() << "BufferLimit " << BufferLimit << "u / "
+        << Rem.RemIssueCount << "u = "
+        << (BufferLimit + Rem.RemIssueCount) / Rem.RemIssueCount << " iters. "
+        << "Latency = " << LatencyLag << "c = "
+        << LatencyLag * SchedModel->getLatencyFactor() << "u\n";
+        if (Rem.IsAcyclicLatencyLimited)
+          dbgs() << "  ACYCLIC LATENCY LIMIT\n");
+}
+
 void ConvergingScheduler::registerRoots() {
   Rem.CriticalPath = DAG->ExitSU.getDepth();
+
+  if (EnableCyclicPath) {
+    Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
+    checkAcyclicLatency();
+  }
   // Some roots may not feed into ExitSU. Check all of them in case.
   for (std::vector<SUnit*>::const_iterator
          I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
@@ -2096,6 +2130,32 @@ static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
   return 0;
 }

+static bool tryLatency(ConvergingScheduler::SchedCandidate &TryCand,
+                       ConvergingScheduler::SchedCandidate &Cand,
+                       ConvergingScheduler::SchedBoundary &Zone) {
+  if (Zone.isTop()) {
+    if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
+      if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+                  TryCand, Cand, ConvergingScheduler::TopDepthReduce))
+        return true;
+    }
+    if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+                   TryCand, Cand, ConvergingScheduler::TopPathReduce))
+      return true;
+  }
+  else {
+    if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
+      if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+                  TryCand, Cand, ConvergingScheduler::BotHeightReduce))
+        return true;
+    }
+    if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+                   TryCand, Cand, ConvergingScheduler::BotPathReduce))
+      return true;
+  }
+  return false;
+}
+
 /// Apply a set of heursitics to a new candidate. Heuristics are currently
 /// hierarchical. This may be more efficient than a graduated cost model because
 /// we don't need to evaluate all aspects of the model for each node in the
@@ -2135,6 +2195,10 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
                   RegExcess))
     return;

+  // For loops that are acyclic path limited, aggressively schedule for latency.
+  if (Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone))
+    return;
+
   // Avoid increasing the max critical pressure in the scheduled region.
   if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
                   TryCand, Cand, RegCritical))
@@ -2174,27 +2238,10 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
     return;

   // Avoid serializing long latency dependence chains.
-  if (Cand.Policy.ReduceLatency) {
-    if (Zone.isTop()) {
-      if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
-        if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
-                    TryCand, Cand, TopDepthReduce))
-          return;
-      }
-      if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
-                     TryCand, Cand, TopPathReduce))
-        return;
-    }
-    else {
-      if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
-        if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
-                    TryCand, Cand, BotHeightReduce))
-          return;
-      }
-      if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
-                     TryCand, Cand, BotPathReduce))
-        return;
-    }
+  // For acyclic path limited loops, latency was already checked above.
+  if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited
+      && tryLatency(TryCand, Cand, Zone)) {
+    return;
   }

   // Prefer immediate defs/users of the last scheduled instruction. This is a
lib/CodeGen/ScheduleDAGInstrs.cpp

@@ -36,6 +36,8 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <queue>
+
 using namespace llvm;

 static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
@@ -979,6 +981,65 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
   PendingLoads.clear();
 }

+/// Compute the max cyclic critical path through the DAG. For loops that span
+/// basic blocks, MachineTraceMetrics should be used for this instead.
+unsigned ScheduleDAGInstrs::computeCyclicCriticalPath() {
+  // This only applies to single block loop.
+  if (!BB->isSuccessor(BB))
+    return 0;
+
+  unsigned MaxCyclicLatency = 0;
+  // Visit each live out vreg def to find def/use pairs that cross iterations.
+  for (SUnit::const_pred_iterator
+         PI = ExitSU.Preds.begin(), PE = ExitSU.Preds.end(); PI != PE; ++PI) {
+    MachineInstr *MI = PI->getSUnit()->getInstr();
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef())
+        break;
+      unsigned Reg = MO.getReg();
+      if (!Reg || TRI->isPhysicalRegister(Reg))
+        continue;
+
+      const LiveInterval &LI = LIS->getInterval(Reg);
+      unsigned LiveOutHeight = PI->getSUnit()->getHeight();
+      unsigned LiveOutDepth = PI->getSUnit()->getDepth() + PI->getLatency();
+      // Visit all local users of the vreg def.
+      for (VReg2UseMap::iterator
+             UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
+        if (UI->SU == &ExitSU)
+          continue;
+
+        // Only consider uses of the phi.
+        LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr()));
+        if (!LRQ.valueIn()->isPHIDef())
+          continue;
+
+        // Cheat a bit and assume that a path spanning two iterations is a
+        // cycle, which could overestimate in strange cases. This allows cyclic
+        // latency to be estimated as the minimum height or depth slack.
+        unsigned CyclicLatency = 0;
+        if (LiveOutDepth > UI->SU->getDepth())
+          CyclicLatency = LiveOutDepth - UI->SU->getDepth();
+        unsigned LiveInHeight = UI->SU->getHeight() + PI->getLatency();
+        if (LiveInHeight > LiveOutHeight) {
+          if (LiveInHeight - LiveOutHeight < CyclicLatency)
+            CyclicLatency = LiveInHeight - LiveOutHeight;
+        }
+        else
+          CyclicLatency = 0;
+        DEBUG(dbgs() << "Cyclic Path: SU(" << PI->getSUnit()->NodeNum
+              << ") -> SU(" << UI->SU->NodeNum << ") = "
+              << CyclicLatency << "\n");
+        if (CyclicLatency > MaxCyclicLatency)
+          MaxCyclicLatency = CyclicLatency;
+      }
+    }
+  }
+  DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "\n");
+  return MaxCyclicLatency;
+}
+
 void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   SU->getInstr()->dump();
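
For intuition, the per def/use estimate in computeCyclicCriticalPath() reduces to the smaller of two slacks, clamped at zero. Here is a minimal restatement under that reading; the helper name and the worked numbers in the comment are invented for illustration, and only the arithmetic comes from the loop body above:

    #include <algorithm>

    // Cyclic latency of one cross-iteration def/use pair: the minimum of the
    // depth slack (the live-out def's depth-plus-latency over the use's depth)
    // and the height slack (the live-in use's height-plus-latency over the
    // def's height), or zero if the height slack is not positive.
    static unsigned cyclicLatencyEstimate(unsigned LiveOutDepth,
                                          unsigned LiveOutHeight,
                                          unsigned UseDepth, unsigned UseHeight,
                                          unsigned Latency) {
      unsigned DepthSlack =
          LiveOutDepth > UseDepth ? LiveOutDepth - UseDepth : 0;
      unsigned LiveInHeight = UseHeight + Latency;
      if (LiveInHeight <= LiveOutHeight)
        return 0;
      // E.g. LiveOutDepth 10 against UseDepth 4 gives DepthSlack 6, while
      // LiveInHeight 7 against LiveOutHeight 5 gives slack 2; estimate is 2.
      return std::min(DepthSlack, LiveInHeight - LiveOutHeight);
    }

Since EnableCyclicPath is cl::init(false), none of this runs by default, which matches the "temporarily disabled" note in the commit message; presumably it can be exercised by passing the hidden -misched-cyclicpath flag when the machine scheduler is enabled.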