MIsched: add an ILP window property to machine model.

This was an experimental option, but needs to be defined per-target. e.g. PPC A2 needs to aggressively hide latency. I converted some in-order scheduling tests to A2. Hal is working on more test cases. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171946 91177308-0d34-0410-b5e6-96231b3b80d8
2025-09-14 15:25:25 +00:00 · 2013-01-09 03:36:49 +00:00
parent 2af949dddd
commit 47579cf390
9 changed files with 51 additions and 27 deletions
--- a/include/llvm/CodeGen/TargetSchedule.h
+++ b/include/llvm/CodeGen/TargetSchedule.h
@@ -84,6 +84,9 @@ public:
  /// \brief Maximum number of micro-ops that may be scheduled per cycle.
  unsigned getIssueWidth() const { return SchedModel.IssueWidth; }
  /// \brief Number of cycles the OOO processor is expected to hide.
  unsigned getILPWindow() const { return SchedModel.ILPWindow; }
  /// \brief Return the number of issue slots required for this MI.
  unsigned getNumMicroOps(const MachineInstr *MI,
                          const MCSchedClassDesc *SC = 0) const;
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -155,7 +155,7 @@ public:
  //      Optional InstrItinerary OperandCycles provides expected latency.
  //      TODO: can't yet specify both min and expected latency per operand.
  int MinLatency;
-  static const unsigned DefaultMinLatency = -1;
+  static const int DefaultMinLatency = -1;
  // LoadLatency is the expected latency of load instructions.
  //
@@ -172,6 +172,16 @@ public:
  unsigned HighLatency;
  static const unsigned DefaultHighLatency = 10;
  // ILPWindow is the number of cycles that the scheduler effectively ignores
  // before attempting to hide latency. This should be zero for in-order cpus to
  // always hide expected latency. For out-of-order cpus, it may be tweaked as
  // desired to roughly approximate instruction buffers. The actual threshold is
  // not very important for an OOO processor, as long as it isn't too high. A
  // nonzero value helps avoid rescheduling to hide latency when it is fairly
  // obviously useless and makes register pressure heuristics more effective.
  unsigned ILPWindow;
  static const unsigned DefaultILPWindow = 0;
  // MispredictPenalty is the typical number of extra cycles the processor
  // takes to recover from a branch misprediction.
  unsigned MispredictPenalty;
@@ -196,6 +206,7 @@ public:
                  MinLatency(DefaultMinLatency),
                  LoadLatency(DefaultLoadLatency),
                  HighLatency(DefaultHighLatency),
                  ILPWindow(DefaultILPWindow),
                  MispredictPenalty(DefaultMispredictPenalty),
                  ProcID(0), ProcResourceTable(0), SchedClassTable(0),
                  NumProcResourceKinds(0), NumSchedClasses(0),
@@ -205,12 +216,12 @@ public:
  }
  // Table-gen driven ctor.
-  MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl, unsigned mp,
+  MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl, unsigned ilp,
-               unsigned pi, const MCProcResourceDesc *pr,
+               unsigned mp, unsigned pi, const MCProcResourceDesc *pr,
               const MCSchedClassDesc *sc, unsigned npr, unsigned nsc,
               const InstrItinerary *ii):
    IssueWidth(iw), MinLatency(ml), LoadLatency(ll), HighLatency(hl),
-    MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr),
+    ILPWindow(ilp), MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr),
    SchedClassTable(sc), NumProcResourceKinds(npr), NumSchedClasses(nsc),
    InstrItineraries(ii) {}
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -76,6 +76,7 @@ class SchedMachineModel {
  int IssueWidth = -1; // Max micro-ops that may be scheduled per cycle.
  int MinLatency = -1; // Determines which instructions are allowed in a group.
                       // (-1) inorder (0) ooo, (1): inorder +var latencies.
  int ILPWindow = -1;  // Cycles of latency likely hidden by hardware buffers.
  int LoadLatency = -1; // Cycles for loads to access the cache.
  int HighLatency = -1; // Approximation of cycles for "high latency" ops.
  int MispredictPenalty = -1; // Extra cycles for a mispredicted branch.
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -48,15 +48,6 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
 static bool ViewMISchedDAGs = false;
 #endif // NDEBUG
 // Threshold to very roughly model an out-of-order processor's instruction
 // buffers. If the actual value of this threshold matters much in practice, then
 // it can be specified by the machine model. For now, it's an experimental
 // tuning knob to determine when and if it matters.
 static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden,
  cl::desc("Allow expected latency to exceed the critical path by N cycles "
           "before attempting to balance ILP"),
  cl::init(10U));
 // Experimental heuristics
 static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
  cl::desc("Enable load clustering."), cl::init(true));
@@ -1297,7 +1288,8 @@ void ConvergingScheduler::SchedBoundary::setLatencyPolicy(CandPolicy &Policy) {
    if (L > RemLatency)
      RemLatency = L;
  }
-  if (RemLatency + ExpectedLatency >= Rem->CriticalPath + ILPWindow
+  unsigned CriticalPathLimit = Rem->CriticalPath + SchedModel->getILPWindow();
  if (RemLatency + ExpectedLatency >= CriticalPathLimit
      && RemLatency > Rem->getMaxRemainingCount(SchedModel)) {
    Policy.ReduceLatency = true;
    DEBUG(dbgs() << "Increase ILP: " << Available.getName() << '\n');
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1887,6 +1887,9 @@ def CortexA9Model : SchedMachineModel {
  let LoadLatency = 2; // Optimistic load latency assuming bypass.
                       // This is overridden by OperandCycles if the
                       // Itineraries are queried instead.
  let ILPWindow = 10; // Don't reschedule small blocks to hide
                      // latency. Minimum latency requirements are already
                      // modeled strictly by reserving resources.
  let MispredictPenalty = 8; // Based on estimate of pipeline depth.
  let Itineraries = CortexA9Itineraries;
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -470,12 +470,17 @@ def IIC_NOP : InstrItinClass;
 // latencies. Since these latencies are not used for pipeline hazards,
 // they do not need to be exact.
 //
 // ILPWindow=10 is an arbitrary threshold that approximates cycles of
 // latency hidden by instruction buffers. The actual value is not very
 // important but should be zero for inorder and nonzero for OOO processors.
 //
 // The GenericModel contains no instruction itineraries.
 def GenericModel : SchedMachineModel {
  let IssueWidth = 4;
  let MinLatency = 0;
  let LoadLatency = 4;
  let HighLatency = 10;
  let ILPWindow = 10;
 }
 include "X86ScheduleAtom.td"
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -525,6 +525,7 @@ def AtomModel : SchedMachineModel {
                       // OperandCycles may be used for expected latency.
  let LoadLatency = 3; // Expected cycles, may be overridden by OperandCycles.
  let HighLatency = 30;// Expected, may be overridden by OperandCycles.
  let ILPWindow = 0; // Always try to hide expected latency.
  let Itineraries = AtomItineraries;
 }
--- a/test/CodeGen/PowerPC/misched-inorder-latency.ll
+++ b/test/CodeGen/PowerPC/misched-inorder-latency.ll
@@ -1,15 +1,15 @@
-; RUN: llc < %s -enable-misched -march=thumb -mcpu=swift \
+; RUN: llc < %s -enable-misched -pre-RA-sched=source -scheditins=false \
 ; RUN:          -pre-RA-sched=source -scheditins=false -ilp-window=0 \
 ; RUN:          -disable-ifcvt-triangle-false -disable-post-ra | FileCheck %s
 ;
-; For these tests, we set -ilp-window=0 to simulate in order processor.
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-bgq-linux"
-; %val1 is a 3-cycle load live out of %entry. It should be hoisted
+; %val1 is a load live out of %entry. It should be hoisted
 ; above the add.
-; CHECK: @testload
+; CHECK: testload:
 ; CHECK: %entry
-; CHECK: ldr
+; CHECK: lwz
-; CHECK: adds
+; CHECK: addi
 ; CHECK: bne
 ; CHECK: %true
 define i32 @testload(i32 *%ptr, i32 %sumin) {
@@ -34,15 +34,22 @@ end:
 ; The prefetch gets a default latency of 3 cycles and should be hoisted
 ; above the add.
 ;
-; CHECK: @testprefetch
+; CHECK: testprefetch:
 ; CHECK: %entry
-; CHECK: pld
+; CHECK: dcbt
-; CHECK: adds
+; CHECK: addi
-; CHECK: bx
+; CHECK: blr
 define i32 @testprefetch(i8 *%ptr, i32 %i) {
 entry:
-  %tmp = add i32 %i, 1
+  %val1 = add i32 %i, 1
  tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 )
-  ret i32 %tmp
+  %p = icmp eq i32 %i, 0
  br i1 %p, label %true, label %end
 true:
  %val2 = add i32 %val1, 1
  br label %end
 end:
  %valmerge = phi i32 [ %val1, %entry], [ %val2, %true ]
  ret i32 %valmerge
 }
 declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -1108,6 +1108,7 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
    EmitProcessorProp(OS, PI->ModelDef, "MinLatency", ',');
    EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ',');
    EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ',');
    EmitProcessorProp(OS, PI->ModelDef, "ILPWindow", ',');
    EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ',');
    OS << "  " << PI->Index << ", // Processor ID\n";
    if (PI->hasInstrSchedModel())