MIsched: add an ILP window property to machine model.

This was an experimental option, but it needs to be defined
per-target; e.g., PPC A2 needs to aggressively hide latency.

I converted some in-order scheduling tests to A2. Hal is working on
more test cases.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171946 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Andrew Trick
2013-01-09 03:36:49 +00:00
parent 2af949dddd
commit 47579cf390
9 changed files with 51 additions and 27 deletions

View File

@ -84,6 +84,9 @@ public:
/// \brief Maximum number of micro-ops that may be scheduled per cycle. /// \brief Maximum number of micro-ops that may be scheduled per cycle.
unsigned getIssueWidth() const { return SchedModel.IssueWidth; } unsigned getIssueWidth() const { return SchedModel.IssueWidth; }
/// \brief Number of cycles of latency the OOO processor is expected to hide.
unsigned getILPWindow() const { return SchedModel.ILPWindow; }
/// \brief Return the number of issue slots required for this MI. /// \brief Return the number of issue slots required for this MI.
unsigned getNumMicroOps(const MachineInstr *MI, unsigned getNumMicroOps(const MachineInstr *MI,
const MCSchedClassDesc *SC = 0) const; const MCSchedClassDesc *SC = 0) const;

View File

@ -155,7 +155,7 @@ public:
// Optional InstrItinerary OperandCycles provides expected latency. // Optional InstrItinerary OperandCycles provides expected latency.
// TODO: can't yet specify both min and expected latency per operand. // TODO: can't yet specify both min and expected latency per operand.
int MinLatency; int MinLatency;
static const unsigned DefaultMinLatency = -1; static const int DefaultMinLatency = -1;
// LoadLatency is the expected latency of load instructions. // LoadLatency is the expected latency of load instructions.
// //
@ -172,6 +172,16 @@ public:
unsigned HighLatency; unsigned HighLatency;
static const unsigned DefaultHighLatency = 10; static const unsigned DefaultHighLatency = 10;
// ILPWindow is the number of cycles that the scheduler effectively ignores
// before attempting to hide latency. This should be zero for in-order cpus to
// always hide expected latency. For out-of-order cpus, it may be tweaked as
// desired to roughly approximate instruction buffers. The actual threshold is
// not very important for an OOO processor, as long as it isn't too high. A
// nonzero value helps avoid rescheduling to hide latency when it is fairly
// obviously useless and makes register pressure heuristics more effective.
unsigned ILPWindow;
static const unsigned DefaultILPWindow = 0;
// MispredictPenalty is the typical number of extra cycles the processor // MispredictPenalty is the typical number of extra cycles the processor
// takes to recover from a branch misprediction. // takes to recover from a branch misprediction.
unsigned MispredictPenalty; unsigned MispredictPenalty;
@ -196,6 +206,7 @@ public:
MinLatency(DefaultMinLatency), MinLatency(DefaultMinLatency),
LoadLatency(DefaultLoadLatency), LoadLatency(DefaultLoadLatency),
HighLatency(DefaultHighLatency), HighLatency(DefaultHighLatency),
ILPWindow(DefaultILPWindow),
MispredictPenalty(DefaultMispredictPenalty), MispredictPenalty(DefaultMispredictPenalty),
ProcID(0), ProcResourceTable(0), SchedClassTable(0), ProcID(0), ProcResourceTable(0), SchedClassTable(0),
NumProcResourceKinds(0), NumSchedClasses(0), NumProcResourceKinds(0), NumSchedClasses(0),
@ -205,12 +216,12 @@ public:
} }
// Table-gen driven ctor. // Table-gen driven ctor.
MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl, unsigned mp, MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl, unsigned ilp,
unsigned pi, const MCProcResourceDesc *pr, unsigned mp, unsigned pi, const MCProcResourceDesc *pr,
const MCSchedClassDesc *sc, unsigned npr, unsigned nsc, const MCSchedClassDesc *sc, unsigned npr, unsigned nsc,
const InstrItinerary *ii): const InstrItinerary *ii):
IssueWidth(iw), MinLatency(ml), LoadLatency(ll), HighLatency(hl), IssueWidth(iw), MinLatency(ml), LoadLatency(ll), HighLatency(hl),
MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr), ILPWindow(ilp), MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr),
SchedClassTable(sc), NumProcResourceKinds(npr), NumSchedClasses(nsc), SchedClassTable(sc), NumProcResourceKinds(npr), NumSchedClasses(nsc),
InstrItineraries(ii) {} InstrItineraries(ii) {}

View File

@ -76,6 +76,7 @@ class SchedMachineModel {
int IssueWidth = -1; // Max micro-ops that may be scheduled per cycle. int IssueWidth = -1; // Max micro-ops that may be scheduled per cycle.
int MinLatency = -1; // Determines which instructions are allowed in a group. int MinLatency = -1; // Determines which instructions are allowed in a group.
// (-1) inorder (0) ooo, (1): inorder +var latencies. // (-1) inorder (0) ooo, (1): inorder +var latencies.
int ILPWindow = -1; // Cycles of latency likely hidden by hardware buffers.
int LoadLatency = -1; // Cycles for loads to access the cache. int LoadLatency = -1; // Cycles for loads to access the cache.
int HighLatency = -1; // Approximation of cycles for "high latency" ops. int HighLatency = -1; // Approximation of cycles for "high latency" ops.
int MispredictPenalty = -1; // Extra cycles for a mispredicted branch. int MispredictPenalty = -1; // Extra cycles for a mispredicted branch.

View File

@ -48,15 +48,6 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
static bool ViewMISchedDAGs = false; static bool ViewMISchedDAGs = false;
#endif // NDEBUG #endif // NDEBUG
// Threshold to very roughly model an out-of-order processor's instruction
// buffers. If the actual value of this threshold matters much in practice, then
// it can be specified by the machine model. For now, it's an experimental
// tuning knob to determine when and if it matters.
static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden,
cl::desc("Allow expected latency to exceed the critical path by N cycles "
"before attempting to balance ILP"),
cl::init(10U));
// Experimental heuristics // Experimental heuristics
static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden, static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
cl::desc("Enable load clustering."), cl::init(true)); cl::desc("Enable load clustering."), cl::init(true));
@ -1297,7 +1288,8 @@ void ConvergingScheduler::SchedBoundary::setLatencyPolicy(CandPolicy &Policy) {
if (L > RemLatency) if (L > RemLatency)
RemLatency = L; RemLatency = L;
} }
if (RemLatency + ExpectedLatency >= Rem->CriticalPath + ILPWindow unsigned CriticalPathLimit = Rem->CriticalPath + SchedModel->getILPWindow();
if (RemLatency + ExpectedLatency >= CriticalPathLimit
&& RemLatency > Rem->getMaxRemainingCount(SchedModel)) { && RemLatency > Rem->getMaxRemainingCount(SchedModel)) {
Policy.ReduceLatency = true; Policy.ReduceLatency = true;
DEBUG(dbgs() << "Increase ILP: " << Available.getName() << '\n'); DEBUG(dbgs() << "Increase ILP: " << Available.getName() << '\n');

View File

@ -1887,6 +1887,9 @@ def CortexA9Model : SchedMachineModel {
let LoadLatency = 2; // Optimistic load latency assuming bypass. let LoadLatency = 2; // Optimistic load latency assuming bypass.
// This is overridden by OperandCycles if the // This is overridden by OperandCycles if the
// Itineraries are queried instead. // Itineraries are queried instead.
let ILPWindow = 10; // Don't reschedule small blocks to hide
// latency. Minimum latency requirements are already
// modeled strictly by reserving resources.
let MispredictPenalty = 8; // Based on estimate of pipeline depth. let MispredictPenalty = 8; // Based on estimate of pipeline depth.
let Itineraries = CortexA9Itineraries; let Itineraries = CortexA9Itineraries;

View File

@ -470,12 +470,17 @@ def IIC_NOP : InstrItinClass;
// latencies. Since these latencies are not used for pipeline hazards, // latencies. Since these latencies are not used for pipeline hazards,
// they do not need to be exact. // they do not need to be exact.
// //
// ILPWindow=10 is an arbitrary threshold that approximates cycles of
// latency hidden by instruction buffers. The actual value is not very
// important but should be zero for inorder and nonzero for OOO processors.
//
// The GenericModel contains no instruction itineraries. // The GenericModel contains no instruction itineraries.
def GenericModel : SchedMachineModel { def GenericModel : SchedMachineModel {
let IssueWidth = 4; let IssueWidth = 4;
let MinLatency = 0; let MinLatency = 0;
let LoadLatency = 4; let LoadLatency = 4;
let HighLatency = 10; let HighLatency = 10;
let ILPWindow = 10;
} }
include "X86ScheduleAtom.td" include "X86ScheduleAtom.td"

View File

@ -525,6 +525,7 @@ def AtomModel : SchedMachineModel {
// OperandCycles may be used for expected latency. // OperandCycles may be used for expected latency.
let LoadLatency = 3; // Expected cycles, may be overridden by OperandCycles. let LoadLatency = 3; // Expected cycles, may be overridden by OperandCycles.
let HighLatency = 30;// Expected, may be overridden by OperandCycles. let HighLatency = 30;// Expected, may be overridden by OperandCycles.
let ILPWindow = 0; // Always try to hide expected latency.
let Itineraries = AtomItineraries; let Itineraries = AtomItineraries;
} }

View File

@ -1,15 +1,15 @@
; RUN: llc < %s -enable-misched -march=thumb -mcpu=swift \ ; RUN: llc < %s -enable-misched -pre-RA-sched=source -scheditins=false \
; RUN: -pre-RA-sched=source -scheditins=false -ilp-window=0 \
; RUN: -disable-ifcvt-triangle-false -disable-post-ra | FileCheck %s ; RUN: -disable-ifcvt-triangle-false -disable-post-ra | FileCheck %s
; ;
; For these tests, we set -ilp-window=0 to simulate in order processor. target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-bgq-linux"
; %val1 is a 3-cycle load live out of %entry. It should be hoisted ; %val1 is a load live out of %entry. It should be hoisted
; above the add. ; above the add.
; CHECK: @testload ; CHECK: testload:
; CHECK: %entry ; CHECK: %entry
; CHECK: ldr ; CHECK: lwz
; CHECK: adds ; CHECK: addi
; CHECK: bne ; CHECK: bne
; CHECK: %true ; CHECK: %true
define i32 @testload(i32 *%ptr, i32 %sumin) { define i32 @testload(i32 *%ptr, i32 %sumin) {
@ -34,15 +34,22 @@ end:
; The prefetch gets a default latency of 3 cycles and should be hoisted ; The prefetch gets a default latency of 3 cycles and should be hoisted
; above the add. ; above the add.
; ;
; CHECK: @testprefetch ; CHECK: testprefetch:
; CHECK: %entry ; CHECK: %entry
; CHECK: pld ; CHECK: dcbt
; CHECK: adds ; CHECK: addi
; CHECK: bx ; CHECK: blr
define i32 @testprefetch(i8 *%ptr, i32 %i) { define i32 @testprefetch(i8 *%ptr, i32 %i) {
entry: entry:
%tmp = add i32 %i, 1 %val1 = add i32 %i, 1
tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 ) tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 )
ret i32 %tmp %p = icmp eq i32 %i, 0
br i1 %p, label %true, label %end
true:
%val2 = add i32 %val1, 1
br label %end
end:
%valmerge = phi i32 [ %val1, %entry], [ %val2, %true ]
ret i32 %valmerge
} }
declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind

View File

@ -1108,6 +1108,7 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
EmitProcessorProp(OS, PI->ModelDef, "MinLatency", ','); EmitProcessorProp(OS, PI->ModelDef, "MinLatency", ',');
EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ','); EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ',');
EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ','); EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ',');
EmitProcessorProp(OS, PI->ModelDef, "ILPWindow", ',');
EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ','); EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ',');
OS << " " << PI->Index << ", // Processor ID\n"; OS << " " << PI->Index << ", // Processor ID\n";
if (PI->hasInstrSchedModel()) if (PI->hasInstrSchedModel())