From 6606ef0e98855e9e46404241eedebacb3b424976 Mon Sep 17 00:00:00 2001 From: Andrew Trick Date: Thu, 5 Dec 2013 17:56:02 +0000 Subject: [PATCH] MI-Sched: Model "reserved" processor resources. This allows a target to use MI-Sched as an in-order scheduler that will model strict resource conflicts without defining a processor itinerary. Instead, the target can now use the new per-operand machine model and define in-order resources with BufferSize=0. For example, this would allow restricting the type of operations that can be formed into a dispatch group. (Normally NumMicroOps is sufficient to enforce dispatch groups). If the intent is to model latency in in-order pipeline, as opposed to resource conflicts, then a resource with BufferSize=1 should be defined instead. This feature is only casually tested as there are no in-tree targets using it yet. However, Hal will be experimenting with POWER7. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@196517 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/ScheduleDAG.h | 21 +++---- lib/CodeGen/MachineScheduler.cpp | 93 ++++++++++++++++++++++++------ lib/CodeGen/ScheduleDAGInstrs.cpp | 8 ++- 3 files changed, 92 insertions(+), 30 deletions(-) diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h index 48ed232a15c..66bf8c5dd77 100644 --- a/include/llvm/CodeGen/ScheduleDAG.h +++ b/include/llvm/CodeGen/ScheduleDAG.h @@ -292,7 +292,8 @@ namespace llvm { bool isScheduleHigh : 1; // True if preferable to schedule high. bool isScheduleLow : 1; // True if preferable to schedule low. bool isCloned : 1; // True if this node has been cloned. - bool isUnbuffered : 1; // Reads an unbuffered resource. + bool isUnbuffered : 1; // Uses an unbuffered resource. + bool hasReservedResource : 1; // Uses a reserved resource. Sched::Preference SchedulingPref; // Scheduling preference. 
private: @@ -318,9 +319,9 @@ namespace llvm { hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), isAvailable(false), isScheduled(false), isScheduleHigh(false), isScheduleLow(false), isCloned(false), isUnbuffered(false), - SchedulingPref(Sched::None), isDepthCurrent(false), - isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), - BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} + hasReservedResource(false), SchedulingPref(Sched::None), + isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), + TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} /// SUnit - Construct an SUnit for post-regalloc scheduling to represent /// a MachineInstr. @@ -333,9 +334,9 @@ namespace llvm { hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), isAvailable(false), isScheduled(false), isScheduleHigh(false), isScheduleLow(false), isCloned(false), isUnbuffered(false), - SchedulingPref(Sched::None), isDepthCurrent(false), - isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), - BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} + hasReservedResource(false), SchedulingPref(Sched::None), + isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), + TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} /// SUnit - Construct a placeholder SUnit. 
SUnit() @@ -347,9 +348,9 @@ namespace llvm { hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), isAvailable(false), isScheduled(false), isScheduleHigh(false), isScheduleLow(false), isCloned(false), isUnbuffered(false), - SchedulingPref(Sched::None), isDepthCurrent(false), - isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), - BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} + hasReservedResource(false), SchedulingPref(Sched::None), + isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), + TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} /// \brief Boundary nodes are placeholders for the boundary of the /// scheduling region. diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 3296149afa8..6cfedcbbc2e 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -1322,6 +1322,8 @@ void CopyConstrain::apply(ScheduleDAGMI *DAG) { // GenericScheduler - Implementation of the generic MachineSchedStrategy. //===----------------------------------------------------------------------===// +static const unsigned InvalidCycle = ~0U; + namespace { /// GenericScheduler shrinks the unscheduled zone using heuristics to balance /// the schedule. @@ -1491,6 +1493,10 @@ public: // Is the scheduled region resource limited vs. latency limited. bool IsResourceLimited; + // Record the highest cycle at which each resource has been reserved by a + // scheduled instruction. + SmallVector<unsigned, 16> ReservedCycles; + #ifndef NDEBUG // Remember the greatest operand latency as an upper bound on the number of // times we should retry the pending queue because of a hazard. @@ -1518,6 +1524,7 @@ public: MaxExecutedResCount = 0; ZoneCritResIdx = 0; IsResourceLimited = false; + ReservedCycles.clear(); #ifndef NDEBUG MaxObservedLatency = 0; #endif @@ -1587,6 +1594,8 @@ public: /// cycle.
unsigned getLatencyStallCycles(SUnit *SU); + unsigned getNextResourceCycle(unsigned PIdx, unsigned Cycles); + bool checkHazard(SUnit *SU); unsigned findMaxLatency(ArrayRef<SUnit*> ReadySUs); @@ -1708,8 +1717,10 @@ init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { DAG = dag; SchedModel = smodel; Rem = rem; - if (SchedModel->hasInstrSchedModel()) + if (SchedModel->hasInstrSchedModel()) { ExecutedResCounts.resize(SchedModel->getNumProcResourceKinds()); + ReservedCycles.resize(SchedModel->getNumProcResourceKinds(), InvalidCycle); + } } /// Initialize the per-region scheduling policy. @@ -1890,6 +1901,20 @@ unsigned GenericScheduler::SchedBoundary::getLatencyStallCycles(SUnit *SU) { return 0; } +/// Compute the next cycle at which the given processor resource can be +/// scheduled. +unsigned GenericScheduler::SchedBoundary:: +getNextResourceCycle(unsigned PIdx, unsigned Cycles) { + unsigned NextUnreserved = ReservedCycles[PIdx]; + // If this resource has never been used, always return cycle zero. + if (NextUnreserved == InvalidCycle) + return 0; + // For bottom-up scheduling add the cycles needed for the current operation. + if (!isTop()) + NextUnreserved += Cycles; + return NextUnreserved; +} + /// Does this SU have a hazard within the current instruction group. /// /// The scheduler supports two modes of hazard recognition.
The first is the @@ -1913,6 +1938,15 @@ bool GenericScheduler::SchedBoundary::checkHazard(SUnit *SU) { << SchedModel->getNumMicroOps(SU->getInstr()) << '\n'); return true; } + if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) { + const MCSchedClassDesc *SC = DAG->getSchedClass(SU); + for (TargetSchedModel::ProcResIter + PI = SchedModel->getWriteProcResBegin(SC), + PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { + if (getNextResourceCycle(PI->ProcResourceIdx, PI->Cycles) > CurrCycle) + return true; + } + } return false; } @@ -2097,7 +2131,7 @@ void GenericScheduler::SchedBoundary::incExecutedResources(unsigned PIdx, /// \return the next cycle at which the instruction may execute without /// oversubscribing resources. unsigned GenericScheduler::SchedBoundary:: -countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle) { +countResource(unsigned PIdx, unsigned Cycles, unsigned NextCycle) { unsigned Factor = SchedModel->getResourceFactor(PIdx); unsigned Count = Factor * Cycles; DEBUG(dbgs() << " " << getResourceName(PIdx) @@ -2116,8 +2150,14 @@ countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle) { << getResourceName(PIdx) << ": " << getResourceCount(PIdx) / SchedModel->getLatencyFactor() << "c\n"); } - // TODO: We don't yet model reserved resources. It's not hard though. - return CurrCycle; + // For reserved resources, record the highest cycle using the resource. + unsigned NextAvailable = getNextResourceCycle(PIdx, Cycles); + if (NextAvailable > CurrCycle) { + DEBUG(dbgs() << " Resource conflict: " + << SchedModel->getProcResource(PIdx)->Name << " reserved until @" + << NextAvailable << "\n"); + } + return NextAvailable; } /// Move the boundary of scheduled code by one SUnit. @@ -2131,25 +2171,17 @@ void GenericScheduler::SchedBoundary::bumpNode(SUnit *SU) { } HazardRec->EmitInstruction(SU); } + // checkHazard should prevent scheduling multiple instructions per cycle that + // exceed the issue width. 
const MCSchedClassDesc *SC = DAG->getSchedClass(SU); unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr()); - CurrMOps += IncMOps; - // checkHazard prevents scheduling multiple instructions per cycle that exceed - // issue width. However, we commonly reach the maximum. In this case - // opportunistically bump the cycle to avoid uselessly checking everything in - // the readyQ. Furthermore, a single instruction may produce more than one - // cycle's worth of micro-ops. - // - // TODO: Also check if this SU must end a dispatch group. - unsigned NextCycle = CurrCycle; - if (CurrMOps >= SchedModel->getIssueWidth()) { - ++NextCycle; - DEBUG(dbgs() << " *** Max MOps " << CurrMOps - << " at cycle " << CurrCycle << '\n'); - } + assert((CurrMOps == 0 || (CurrMOps + IncMOps) <= SchedModel->getIssueWidth()) && + "Cannot schedule this instruction's MicroOps in the current cycle."); + unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle); DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n"); + unsigned NextCycle = CurrCycle; switch (SchedModel->getMicroOpBufferSize()) { case 0: assert(ReadyCycle <= CurrCycle && "Broken PendingQueue"); @@ -2194,10 +2226,23 @@ void GenericScheduler::SchedBoundary::bumpNode(SUnit *SU) { PI = SchedModel->getWriteProcResBegin(SC), PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { unsigned RCycle = - countResource(PI->ProcResourceIdx, PI->Cycles, ReadyCycle); + countResource(PI->ProcResourceIdx, PI->Cycles, NextCycle); if (RCycle > NextCycle) NextCycle = RCycle; } + if (SU->hasReservedResource) { + // For reserved resources, record the highest cycle using the resource. + // For top-down scheduling, this is the cycle in which we schedule this + // instruction plus the number of cycles the operation reserves the + // resource. For bottom-up it is simply the instruction's cycle. 
+ for (TargetSchedModel::ProcResIter + PI = SchedModel->getWriteProcResBegin(SC), + PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { + unsigned PIdx = PI->ProcResourceIdx; + if (SchedModel->getProcResource(PIdx)->BufferSize == 0) + ReservedCycles[PIdx] = isTop() ? NextCycle + PI->Cycles : NextCycle; + } + } } // Update ExpectedLatency and DependentLatency. unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency; @@ -2224,6 +2269,16 @@ void GenericScheduler::SchedBoundary::bumpNode(SUnit *SU) { (int)(getCriticalCount() - (getScheduledLatency() * LFactor)) > (int)LFactor; } + // Update CurrMOps after calling bumpCycle to handle stalls, since bumpCycle + // resets CurrMOps. Loop to handle instructions with more MOps than issue in + // one cycle. Since we commonly reach the max MOps here, opportunistically + // bump the cycle to avoid uselessly checking everything in the readyQ. + CurrMOps += IncMOps; + while (CurrMOps >= SchedModel->getIssueWidth()) { + bumpCycle(++NextCycle); + DEBUG(dbgs() << " *** Max MOps " << CurrMOps + << " at cycle " << CurrCycle << '\n'); + } DEBUG(dumpScheduledState()); } diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index eeae6ec03d8..977b8f0b41d 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -697,9 +697,15 @@ void ScheduleDAGInstrs::initSUnits() { for (TargetSchedModel::ProcResIter PI = SchedModel.getWriteProcResBegin(SC), PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) { - if (SchedModel.getProcResource(PI->ProcResourceIdx)->BufferSize == 1) { + switch (SchedModel.getProcResource(PI->ProcResourceIdx)->BufferSize) { + case 0: + SU->hasReservedResource = true; + break; + case 1: SU->isUnbuffered = true; break; + default: + break; } } }