diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 4a447e2f4af..1b813141a8a 100644
--- a/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -197,6 +197,9 @@ namespace llvm {
     /// input.
     void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker = 0);
 
+    /// Compute the cyclic critical path through the DAG.
+    unsigned computeCyclicCriticalPath();
+
     /// addSchedBarrierDeps - Add dependencies from instructions in the current
     /// list of instructions being scheduled to scheduling barrier. We want to
     /// make sure instructions which define registers that are either used by
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index da6920575db..36eeb67d642 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -53,6 +53,9 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
 static bool ViewMISchedDAGs = false;
 #endif // NDEBUG
 
+static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
+  cl::desc("Enable cyclic critical path analysis."), cl::init(false));
+
 static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
   cl::desc("Enable load clustering."), cl::init(true));
 
@@ -1207,16 +1210,21 @@ public:
   struct SchedRemainder {
     // Critical path through the DAG in expected latency.
     unsigned CriticalPath;
+    unsigned CyclicCritPath;
 
     // Scaled count of micro-ops left to schedule.
     unsigned RemIssueCount;
 
+    bool IsAcyclicLatencyLimited;
+
     // Unscheduled resources
     SmallVector RemainingCounts;
 
     void reset() {
       CriticalPath = 0;
+      CyclicCritPath = 0;
       RemIssueCount = 0;
+      IsAcyclicLatencyLimited = false;
       RemainingCounts.clear();
     }
 
@@ -1434,6 +1442,8 @@ public:
   virtual void registerRoots();
 
 protected:
+  void checkAcyclicLatency();
+
   void tryCandidate(SchedCandidate &Cand,
                     SchedCandidate &TryCand,
                     SchedBoundary &Zone,
@@ -1547,8 +1557,32 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
   Bot.releaseNode(SU, SU->BotReadyCycle);
 }
 
+void ConvergingScheduler::checkAcyclicLatency() {
+  if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
+    return;
+
+  unsigned BufferLimit =
+    SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
+  unsigned LatencyLag = Rem.CriticalPath - Rem.CyclicCritPath;
+  Rem.IsAcyclicLatencyLimited =
+    (LatencyLag * SchedModel->getLatencyFactor()) > BufferLimit;
+
+  DEBUG(dbgs() << "BufferLimit " << BufferLimit << "u / "
+        << Rem.RemIssueCount << "u = "
+        << (BufferLimit + Rem.RemIssueCount) / Rem.RemIssueCount << " iters. "
+        << "Latency = " << LatencyLag << "c = "
+        << LatencyLag * SchedModel->getLatencyFactor() << "u\n";
+        if (Rem.IsAcyclicLatencyLimited)
+          dbgs() << "  ACYCLIC LATENCY LIMIT\n");
+}
+
 void ConvergingScheduler::registerRoots() {
   Rem.CriticalPath = DAG->ExitSU.getDepth();
+
+  if (EnableCyclicPath) {
+    Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
+    checkAcyclicLatency();
+  }
   // Some roots may not feed into ExitSU. Check all of them in case.
   for (std::vector<SUnit*>::const_iterator
          I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
@@ -2096,6 +2130,32 @@ static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
   return 0;
 }
 
+static bool tryLatency(ConvergingScheduler::SchedCandidate &TryCand,
+                       ConvergingScheduler::SchedCandidate &Cand,
+                       ConvergingScheduler::SchedBoundary &Zone) {
+  if (Zone.isTop()) {
+    if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
+      if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+                  TryCand, Cand, ConvergingScheduler::TopDepthReduce))
+        return true;
+    }
+    if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+                   TryCand, Cand, ConvergingScheduler::TopPathReduce))
+      return true;
+  }
+  else {
+    if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
+      if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+                  TryCand, Cand, ConvergingScheduler::BotHeightReduce))
+        return true;
+    }
+    if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+                   TryCand, Cand, ConvergingScheduler::BotPathReduce))
+      return true;
+  }
+  return false;
+}
+
 /// Apply a set of heuristics to a new candidate. Heuristics are currently
 /// hierarchical. This may be more efficient than a graduated cost model because
 /// we don't need to evaluate all aspects of the model for each node in the
@@ -2135,6 +2195,10 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
                   RegExcess))
     return;
 
+  // For loops that are acyclic path limited, aggressively schedule for latency.
+  if (Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone))
+    return;
+
   // Avoid increasing the max critical pressure in the scheduled region.
   if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
                   TryCand, Cand, RegCritical))
     return;
@@ -2174,27 +2238,10 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
     return;
 
   // Avoid serializing long latency dependence chains.
-  if (Cand.Policy.ReduceLatency) {
-    if (Zone.isTop()) {
-      if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
-        if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
-                    TryCand, Cand, TopDepthReduce))
-          return;
-      }
-      if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
-                     TryCand, Cand, TopPathReduce))
-        return;
-    }
-    else {
-      if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
-        if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
-                    TryCand, Cand, BotHeightReduce))
-          return;
-      }
-      if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
-                     TryCand, Cand, BotPathReduce))
-        return;
-    }
+  // For acyclic path limited loops, latency was already checked above.
+  if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited
+      && tryLatency(TryCand, Cand, Zone)) {
+    return;
   }
 
   // Prefer immediate defs/users of the last scheduled instruction. This is a
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 24714089da4..0b5eb0ebe89 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -36,6 +36,8 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include
+
 using namespace llvm;
 
 static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
@@ -979,6 +981,65 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
   PendingLoads.clear();
 }
 
+/// Compute the max cyclic critical path through the DAG. For loops that span
+/// basic blocks, MachineTraceMetrics should be used for this instead.
+unsigned ScheduleDAGInstrs::computeCyclicCriticalPath() {
+  // This only applies to single block loops.
+  if (!BB->isSuccessor(BB))
+    return 0;
+
+  unsigned MaxCyclicLatency = 0;
+  // Visit each live out vreg def to find def/use pairs that cross iterations.
+  for (SUnit::const_pred_iterator
+         PI = ExitSU.Preds.begin(), PE = ExitSU.Preds.end(); PI != PE; ++PI) {
+    MachineInstr *MI = PI->getSUnit()->getInstr();
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef())
+        break;
+      unsigned Reg = MO.getReg();
+      if (!Reg || TRI->isPhysicalRegister(Reg))
+        continue;
+
+      const LiveInterval &LI = LIS->getInterval(Reg);
+      unsigned LiveOutHeight = PI->getSUnit()->getHeight();
+      unsigned LiveOutDepth = PI->getSUnit()->getDepth() + PI->getLatency();
+      // Visit all local users of the vreg def.
+      for (VReg2UseMap::iterator
+             UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
+        if (UI->SU == &ExitSU)
+          continue;
+
+        // Only consider uses of the phi.
+        LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr()));
+        if (!LRQ.valueIn()->isPHIDef())
+          continue;
+
+        // Cheat a bit and assume that a path spanning two iterations is a
+        // cycle, which could overestimate in strange cases. This allows cyclic
+        // latency to be estimated as the minimum height or depth slack.
+        unsigned CyclicLatency = 0;
+        if (LiveOutDepth > UI->SU->getDepth())
+          CyclicLatency = LiveOutDepth - UI->SU->getDepth();
+        unsigned LiveInHeight = UI->SU->getHeight() + PI->getLatency();
+        if (LiveInHeight > LiveOutHeight) {
+          if (LiveInHeight - LiveOutHeight < CyclicLatency)
+            CyclicLatency = LiveInHeight - LiveOutHeight;
+        }
+        else
+          CyclicLatency = 0;
+        DEBUG(dbgs() << "Cyclic Path: SU(" << PI->getSUnit()->NodeNum
+              << ") -> SU(" << UI->SU->NodeNum << ") = "
+              << CyclicLatency << "\n");
+        if (CyclicLatency > MaxCyclicLatency)
+          MaxCyclicLatency = CyclicLatency;
+      }
+    }
+  }
+  DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "\n");
+  return MaxCyclicLatency;
+}
+
 void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   SU->getInstr()->dump();
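Note: the following is an illustrative, standalone sketch, not part of the patch. It restates the two computations above in isolation: checkAcyclicLatency() flags a region as acyclic-latency limited when the lag between the acyclic and cyclic critical paths, scaled to micro-op units, exceeds what the micro-op buffer can cover, and computeCyclicCriticalPath() estimates each cross-iteration def/use edge's cyclic latency as the smaller of its depth slack and height slack. All numeric values and variable names below are invented for demonstration.

// Illustrative sketch only -- not part of the patch. All values are invented.
#include <algorithm>
#include <cstdio>

int main() {
  // checkAcyclicLatency(): compare the latency lag against the buffer limit.
  unsigned CriticalPath = 30;      // acyclic critical path, in cycles
  unsigned CyclicCritPath = 10;    // cyclic critical path, in cycles
  unsigned MicroOpBufferSize = 8;  // hypothetical machine-model value
  unsigned MicroOpFactor = 1, LatencyFactor = 1;

  unsigned BufferLimit = MicroOpBufferSize * MicroOpFactor;
  unsigned LatencyLag = CriticalPath - CyclicCritPath;
  bool Limited = LatencyLag * LatencyFactor > BufferLimit;
  std::printf("LatencyLag = %u, acyclic latency limited = %d\n",
              LatencyLag, (int)Limited);

  // computeCyclicCriticalPath(): for one live-out def feeding a PHI use in
  // the next iteration, the cyclic latency is the smaller of the depth slack
  // and the height slack, clamped at zero.
  unsigned LiveOutDepth = 12, UseDepth = 4;      // def depth + latency vs. use depth
  unsigned LiveInHeight = 9, LiveOutHeight = 6;  // use height + latency vs. def height
  unsigned DepthSlack = LiveOutDepth > UseDepth ? LiveOutDepth - UseDepth : 0;
  unsigned HeightSlack =
      LiveInHeight > LiveOutHeight ? LiveInHeight - LiveOutHeight : 0;
  unsigned CyclicLatency = std::min(DepthSlack, HeightSlack);
  std::printf("CyclicLatency = %u\n", CyclicLatency);
  return 0;
}

With these numbers the region is latency limited (lag of 20 cycles against an 8 micro-op buffer) and the sampled edge contributes a cyclic latency of 3, the height slack being the binding term.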