diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index ccba1b0364e..48ed232a15c 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -292,6 +292,7 @@ namespace llvm {
     bool isScheduleHigh : 1;            // True if preferable to schedule high.
     bool isScheduleLow : 1;             // True if preferable to schedule low.
     bool isCloned : 1;                  // True if this node has been cloned.
+    bool isUnbuffered : 1;              // Reads an unbuffered resource.
     Sched::Preference SchedulingPref;   // Scheduling preference.
 
   private:
@@ -316,9 +317,10 @@ namespace llvm {
       isTwoAddress(false), isCommutable(false), hasPhysRegUses(false),
       hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false),
       isAvailable(false), isScheduled(false), isScheduleHigh(false),
-      isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None),
-      isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
-      TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
+      isScheduleLow(false), isCloned(false), isUnbuffered(false),
+      SchedulingPref(Sched::None), isDepthCurrent(false),
+      isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
+      BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
 
     /// SUnit - Construct an SUnit for post-regalloc scheduling to represent
     /// a MachineInstr.
@@ -330,9 +332,10 @@ namespace llvm {
       isTwoAddress(false), isCommutable(false), hasPhysRegUses(false),
       hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false),
       isAvailable(false), isScheduled(false), isScheduleHigh(false),
-      isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None),
-      isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
-      TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
+      isScheduleLow(false), isCloned(false), isUnbuffered(false),
+      SchedulingPref(Sched::None), isDepthCurrent(false),
+      isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
+      BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
 
     /// SUnit - Construct a placeholder SUnit.
     SUnit()
@@ -343,9 +346,10 @@ namespace llvm {
      isTwoAddress(false), isCommutable(false), hasPhysRegUses(false),
      hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false),
      isAvailable(false), isScheduled(false), isScheduleHigh(false),
-     isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None),
-     isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
-     TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
+     isScheduleLow(false), isCloned(false), isUnbuffered(false),
+     SchedulingPref(Sched::None), isDepthCurrent(false),
+     isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
+     BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
 
     /// \brief Boundary nodes are placeholders for the boundary of the
     /// scheduling region.
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index f8c4a893c12..3296149afa8 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -1330,7 +1330,7 @@ public:
   /// Represent the type of SchedCandidate found within a single queue.
   /// pickNodeBidirectional depends on these listed by decreasing priority.
   enum CandReason {
-    NoCand, PhysRegCopy, RegExcess, RegCritical, Cluster, Weak, RegMax,
+    NoCand, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax,
     ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
     TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
 
@@ -1583,6 +1583,10 @@ public:
                       MaxExecutedResCount);
     }
 
+    /// Get the difference between the given SUnit's ready time and the current
+    /// cycle.
+    unsigned getLatencyStallCycles(SUnit *SU);
+
     bool checkHazard(SUnit *SU);
 
     unsigned findMaxLatency(ArrayRef<SUnit*> ReadySUs);
@@ -1869,6 +1873,23 @@ void GenericScheduler::registerRoots() {
   }
 }
 
+/// Compute the stall cycles based on this SUnit's ready time. Heuristics treat
+/// these "soft stalls" differently than the hard stall cycles based on CPU
+/// resources and computed by checkHazard(). A fully in-order model
+/// (MicroOpBufferSize==0) will not make use of this since instructions are not
+/// available for scheduling until they are ready. However, a weaker in-order
+/// model may use this for heuristics. For example, if a processor has in-order
+/// behavior when reading certain resources, this may come into play.
+unsigned GenericScheduler::SchedBoundary::getLatencyStallCycles(SUnit *SU) {
+  if (!SU->isUnbuffered)
+    return 0;
+
+  unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
+  if (ReadyCycle > CurrCycle)
+    return ReadyCycle - CurrCycle;
+  return 0;
+}
+
 /// Does this SU have a hazard within the current instruction group.
 ///
 /// The scheduler supports two modes of hazard recognition. The first is the
@@ -1948,9 +1969,9 @@ getOtherResourceCount(unsigned &OtherCritIdx) {
 /// inside and outside the zone.
 void GenericScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
                                                 SchedBoundary &OtherZone) {
-  // Now that potential stalls have been considered, apply preemptive heuristics
-  // based on the the total latency and resources inside and outside this
-  // zone.
+  // Apply preemptive heuristics based on the total latency and resources
+  // inside and outside this zone. Potential stalls should be considered before
+  // following this policy.
 
   // Compute remaining latency. We need this both to determine whether the
   // overall schedule has become latency-limited and whether the instructions
@@ -2141,7 +2162,11 @@ void GenericScheduler::SchedBoundary::bumpNode(SUnit *SU) {
     break;
   default:
     // We don't currently model the OOO reorder buffer, so consider all
-    // scheduled MOps to be "retired".
+    // scheduled MOps to be "retired". We do loosely model in-order resource
+    // latency. If this instruction uses an in-order resource, account for any
+    // likely stall cycles.
+    if (SU->isUnbuffered && ReadyCycle > NextCycle)
+      NextCycle = ReadyCycle;
     break;
   }
   RetiredMOps += IncMOps;
@@ -2514,6 +2539,11 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
       && tryLatency(TryCand, Cand, Zone))
     return;
 
+  // Prioritize instructions that read unbuffered resources by stall cycles.
+  if (tryLess(Zone.getLatencyStallCycles(TryCand.SU),
+              Zone.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+    return;
+
   // Keep clustered nodes together to encourage downstream peephole
   // optimizations which may reduce resource requirements.
   //
@@ -2577,6 +2607,7 @@ const char *GenericScheduler::getReasonStr(
   case PhysRegCopy:    return "PREG-COPY";
   case RegExcess:      return "REG-EXCESS";
   case RegCritical:    return "REG-CRIT  ";
+  case Stall:          return "STALL     ";
   case Cluster:        return "CLUSTER   ";
   case Weak:           return "WEAK      ";
   case RegMax:         return "REG-MAX   ";
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 7d9160ab1f4..eeae6ec03d8 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -687,6 +687,22 @@ void ScheduleDAGInstrs::initSUnits() {
 
     // Assign the Latency field of SU using target-provided information.
     SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());
+
+    // If this SUnit uses an unbuffered resource, mark it as such.
+    // These resources are used for in-order execution pipelines within an
+    // out-of-order core and are identified by BufferSize=1. BufferSize=0 is
+    // used for dispatch/issue groups and is not considered here.
+    if (SchedModel.hasInstrSchedModel()) {
+      const MCSchedClassDesc *SC = getSchedClass(SU);
+      for (TargetSchedModel::ProcResIter
+             PI = SchedModel.getWriteProcResBegin(SC),
+             PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) {
+        if (SchedModel.getProcResource(PI->ProcResourceIdx)->BufferSize == 1) {
+          SU->isUnbuffered = true;
+          break;
+        }
+      }
+    }
   }
 }
 
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 6276cfc200d..7d114a36b9a 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1905,7 +1905,7 @@ def A9UnitALU : ProcResource<2>;
 def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
 def A9UnitAGU : ProcResource<1>;
 def A9UnitLS  : ProcResource<1>;
-def A9UnitFP  : ProcResource<1> { let BufferSize = 0; }
+def A9UnitFP  : ProcResource<1> { let BufferSize = 1; }
 def A9UnitB   : ProcResource<1>;
 
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/ARM/saxpy10-a9.ll b/test/CodeGen/ARM/saxpy10-a9.ll
new file mode 100644
index 00000000000..1102800dce0
--- /dev/null
+++ b/test/CodeGen/ARM/saxpy10-a9.ll
@@ -0,0 +1,135 @@
+; RUN: llc < %s -march=arm -mtriple=thumbv7-apple-ios7.0.0 -float-abi=hard -mcpu=cortex-a9 -disable-post-ra -misched-bench -scheditins=false | FileCheck %s
+;
+; Test MI-Sched support for latency-based stalls on an in-order pipeline
+; using the new machine model.
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+
+; Don't be too strict with the top of the schedule, but most of it
+; should be nicely pipelined.
+;
+; CHECK: saxpy10:
+; CHECK: vldr
+; CHECK: vldr
+; CHECK: vldr
+; CHECK: vldr
+; CHECK: vldr
+; CHECK: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vmov
+; CHECK-NEXT: bx
+;
+; This accumulates a sum rather than storing each result.
+define float @saxpy10(float* nocapture readonly %data1, float* nocapture readonly %data2, float %a) {
+entry:
+  %0 = load float* %data1, align 4
+  %mul = fmul float %0, %a
+  %1 = load float* %data2, align 4
+  %add = fadd float %mul, %1
+  %add2 = fadd float %add, 0.000000e+00
+  %arrayidx.1 = getelementptr inbounds float* %data1, i32 1
+  %2 = load float* %arrayidx.1, align 4
+  %mul.1 = fmul float %2, %a
+  %arrayidx1.1 = getelementptr inbounds float* %data2, i32 1
+  %3 = load float* %arrayidx1.1, align 4
+  %add.1 = fadd float %mul.1, %3
+  %add2.1 = fadd float %add2, %add.1
+  %arrayidx.2 = getelementptr inbounds float* %data1, i32 2
+  %4 = load float* %arrayidx.2, align 4
+  %mul.2 = fmul float %4, %a
+  %arrayidx1.2 = getelementptr inbounds float* %data2, i32 2
+  %5 = load float* %arrayidx1.2, align 4
+  %add.2 = fadd float %mul.2, %5
+  %add2.2 = fadd float %add2.1, %add.2
+  %arrayidx.3 = getelementptr inbounds float* %data1, i32 3
+  %6 = load float* %arrayidx.3, align 4
+  %mul.3 = fmul float %6, %a
+  %arrayidx1.3 = getelementptr inbounds float* %data2, i32 3
+  %7 = load float* %arrayidx1.3, align 4
+  %add.3 = fadd float %mul.3, %7
+  %add2.3 = fadd float %add2.2, %add.3
+  %arrayidx.4 = getelementptr inbounds float* %data1, i32 4
+  %8 = load float* %arrayidx.4, align 4
+  %mul.4 = fmul float %8, %a
+  %arrayidx1.4 = getelementptr inbounds float* %data2, i32 4
+  %9 = load float* %arrayidx1.4, align 4
+  %add.4 = fadd float %mul.4, %9
+  %add2.4 = fadd float %add2.3, %add.4
+  %arrayidx.5 = getelementptr inbounds float* %data1, i32 5
+  %10 = load float* %arrayidx.5, align 4
+  %mul.5 = fmul float %10, %a
+  %arrayidx1.5 = getelementptr inbounds float* %data2, i32 5
+  %11 = load float* %arrayidx1.5, align 4
+  %add.5 = fadd float %mul.5, %11
+  %add2.5 = fadd float %add2.4, %add.5
+  %arrayidx.6 = getelementptr inbounds float* %data1, i32 6
+  %12 = load float* %arrayidx.6, align 4
+  %mul.6 = fmul float %12, %a
+  %arrayidx1.6 = getelementptr inbounds float* %data2, i32 6
+  %13 = load float* %arrayidx1.6, align 4
+  %add.6 = fadd float %mul.6, %13
+  %add2.6 = fadd float %add2.5, %add.6
+  %arrayidx.7 = getelementptr inbounds float* %data1, i32 7
+  %14 = load float* %arrayidx.7, align 4
+  %mul.7 = fmul float %14, %a
+  %arrayidx1.7 = getelementptr inbounds float* %data2, i32 7
+  %15 = load float* %arrayidx1.7, align 4
+  %add.7 = fadd float %mul.7, %15
+  %add2.7 = fadd float %add2.6, %add.7
+  %arrayidx.8 = getelementptr inbounds float* %data1, i32 8
+  %16 = load float* %arrayidx.8, align 4
+  %mul.8 = fmul float %16, %a
+  %arrayidx1.8 = getelementptr inbounds float* %data2, i32 8
+  %17 = load float* %arrayidx1.8, align 4
+  %add.8 = fadd float %mul.8, %17
+  %add2.8 = fadd float %add2.7, %add.8
+  %arrayidx.9 = getelementptr inbounds float* %data1, i32 9
+  %18 = load float* %arrayidx.9, align 4
+  %mul.9 = fmul float %18, %a
+  %arrayidx1.9 = getelementptr inbounds float* %data2, i32 9
+  %19 = load float* %arrayidx1.9, align 4
+  %add.9 = fadd float %mul.9, %19
+  %add2.9 = fadd float %add2.8, %add.9
+  ret float %add2.9
+}
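
For reference, the BufferSize convention this patch relies on can be summarized with a small TableGen sketch. This is an illustrative example only, not part of the patch; the resource and write names (MyUnitFP, MyIssueGroup, MyWriteVFP) are hypothetical, and only the A9UnitFP change above is real.

// Hypothetical target fragment: an in-order pipe inside an out-of-order core.
// BufferSize = 1 marks the resource as unbuffered, so initSUnits() sets
// SUnit::isUnbuffered on instructions that use it and the new Stall heuristic
// can prioritize them via getLatencyStallCycles().
def MyUnitFP : ProcResource<1> { let BufferSize = 1; }

// BufferSize = 0 models dispatch/issue groups and is deliberately ignored by
// the new isUnbuffered check.
def MyIssueGroup : ProcResource<2> { let BufferSize = 0; }

// Default (buffered) resources are unaffected and keep out-of-order behavior.
def MyUnitALU : ProcResource<2>;

// A write that occupies the unbuffered pipe; any instruction mapped to this
// SchedWrite is flagged isUnbuffered when the scheduling DAG is built.
def MyWriteVFP : SchedWriteRes<[MyUnitFP]> { let Latency = 4; }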