From 3b87f6204fe094610282eea4c8ad7ea4e331d8db Mon Sep 17 00:00:00 2001 From: Andrew Trick Date: Wed, 7 Nov 2012 07:05:09 +0000 Subject: [PATCH] misched: Heuristics based on the machine model. misched is disabled by default. With -enable-misched, these heuristics balance the schedule to simultaneously avoid saturating processor resources, expose ILP, and minimize register pressure. I've been analyzing the performance of these heuristics on everything in the llvm test suite in addition to a few other benchmarks. I would like each heuristic check to be verified by a unit test, but I'm still trying to figure out the best way to do that. The heuristics are still in considerable flux, but as they are refined we should be rigorous about unit testing the improvements. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167527 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/MachineScheduler.h | 9 +- lib/CodeGen/MachineScheduler.cpp | 912 ++++++++++++++++++++---- test/CodeGen/X86/misched-balance.ll | 230 ++++++ 3 files changed, 1002 insertions(+), 149 deletions(-) create mode 100644 test/CodeGen/X86/misched-balance.ll diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h index 2b96c7abe42..31bd606f932 100644 --- a/include/llvm/CodeGen/MachineScheduler.h +++ b/include/llvm/CodeGen/MachineScheduler.h @@ -154,6 +154,8 @@ public: bool empty() const { return Queue.empty(); } + void clear() { Queue.clear(); } + unsigned size() const { return Queue.size(); } typedef std::vector::iterator iterator; @@ -171,10 +173,12 @@ public: SU->NodeQueueId |= ID; } - void remove(iterator I) { + iterator remove(iterator I) { (*I)->NodeQueueId &= ~ID; *I = Queue.back(); + unsigned idx = I - Queue.begin(); Queue.pop_back(); + return Queue.begin() + idx; } #ifndef NDEBUG @@ -306,6 +310,9 @@ protected: /// Reinsert debug_values recorded in ScheduleDAGInstrs::DbgValues. void placeDebugValues(); + /// \brief dump the scheduled Sequence. + void dumpSchedule() const; + // Lesser helpers... void initRegPressure(); diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 2438eb10873..de16932c061 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -49,6 +49,15 @@ static cl::opt MISchedCutoff("misched-cutoff", cl::Hidden, static bool ViewMISchedDAGs = false; #endif // NDEBUG +// Threshold to very roughly model an out-of-order processor's instruction +// buffers. If the actual value of this threshold matters much in practice, then +// it can be specified by the machine model. For now, it's an experimental +// tuning knob to determine when and if it matters. +static cl::opt ILPWindow("ilp-window", cl::Hidden, + cl::desc("Allow expected latency to exceed the critical path by N cycles " + "before attempting to balance ILP"), + cl::init(10U)); + //===----------------------------------------------------------------------===// // Machine Instruction Scheduling Pass and Registry //===----------------------------------------------------------------------===// @@ -487,6 +496,13 @@ void ScheduleDAGMI::schedule() { assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); placeDebugValues(); + + DEBUG({ + unsigned BBNum = top()->getParent()->getNumber(); + dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n"; + dumpSchedule(); + dbgs() << '\n'; + }); } /// Build the DAG and setup three register pressure trackers. @@ -627,6 +643,17 @@ void ScheduleDAGMI::placeDebugValues() { FirstDbgValue = NULL; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void ScheduleDAGMI::dumpSchedule() const { + for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) { + if (SUnit *SU = getSUnit(&(*MI))) + SU->dump(this); + else + dbgs() << "Missing SUnit\n"; + } +} +#endif + //===----------------------------------------------------------------------===// // ConvergingScheduler - Implementation of the standard MachineSchedStrategy. //===----------------------------------------------------------------------===// @@ -635,33 +662,127 @@ namespace { /// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance /// the schedule. class ConvergingScheduler : public MachineSchedStrategy { +public: + /// Represent the type of SchedCandidate found within a single queue. + /// pickNodeBidirectional depends on these listed by decreasing priority. + enum CandReason { + NoCand, SingleExcess, SingleCritical, ResourceReduce, ResourceDemand, + BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce, + SingleMax, MultiPressure, NextDefUse, NodeOrder}; + +#ifndef NDEBUG + static const char *getReasonStr(ConvergingScheduler::CandReason Reason); +#endif + + /// Policy for scheduling the next instruction in the candidate's zone. + struct CandPolicy { + bool ReduceLatency; + unsigned ReduceResIdx; + unsigned DemandResIdx; + + CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {} + }; + + /// Status of an instruction's critical resource consumption. + struct SchedResourceDelta { + // Count critical resources in the scheduled region required by SU. + unsigned CritResources; + + // Count critical resources from another region consumed by SU. + unsigned DemandedResources; + + SchedResourceDelta(): CritResources(0), DemandedResources(0) {} + + bool operator==(const SchedResourceDelta &RHS) const { + return CritResources == RHS.CritResources + && DemandedResources == RHS.DemandedResources; + } + bool operator!=(const SchedResourceDelta &RHS) const { + return !operator==(RHS); + } + }; /// Store the state used by ConvergingScheduler heuristics, required for the /// lifetime of one invocation of pickNode(). struct SchedCandidate { + CandPolicy Policy; + // The best SUnit candidate. SUnit *SU; + // The reason for this candidate. + CandReason Reason; + // Register pressure values for the best candidate. RegPressureDelta RPDelta; - SchedCandidate(): SU(NULL) {} + // Critical resource consumption of the best candidate. + SchedResourceDelta ResDelta; + + SchedCandidate(const CandPolicy &policy) + : Policy(policy), SU(NULL), Reason(NoCand) {} + + bool isValid() const { return SU; } + + // Copy the status of another candidate without changing policy. + void setBest(SchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + SU = Best.SU; + Reason = Best.Reason; + RPDelta = Best.RPDelta; + ResDelta = Best.ResDelta; + } + + void initResourceDelta(const ScheduleDAGMI *DAG, + const TargetSchedModel *SchedModel); + }; + + /// Summarize the unscheduled region. + struct SchedRemainder { + // Critical path through the DAG in expected latency. + unsigned CriticalPath; + + // Unscheduled resources + SmallVector RemainingCounts; + // Critical resource for the unscheduled zone. + unsigned CritResIdx; + // Number of micro-ops left to schedule. + unsigned RemainingMicroOps; + // Is the unscheduled zone resource limited. + bool IsResourceLimited; + + unsigned MaxRemainingCount; + + void reset() { + CriticalPath = 0; + RemainingCounts.clear(); + CritResIdx = 0; + RemainingMicroOps = 0; + IsResourceLimited = false; + MaxRemainingCount = 0; + } + + SchedRemainder() { reset(); } + + void init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel); }; - /// Represent the type of SchedCandidate found within a single queue. - enum CandResult { - NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure }; /// Each Scheduling boundary is associated with ready queues. It tracks the - /// current cycle in whichever direction at has moved, and maintains the state + /// current cycle in the direction of movement, and maintains the state /// of "hazards" and other interlocks at the current cycle. struct SchedBoundary { ScheduleDAGMI *DAG; const TargetSchedModel *SchedModel; + SchedRemainder *Rem; ReadyQueue Available; ReadyQueue Pending; bool CheckPending; + // For heuristics, keep a list of the nodes that immediately depend on the + // most recently scheduled node. + SmallPtrSet NextSUs; + ScheduleHazardRecognizer *HazardRec; unsigned CurrCycle; @@ -670,34 +791,88 @@ class ConvergingScheduler : public MachineSchedStrategy { /// MinReadyCycle - Cycle of the soonest available instruction. unsigned MinReadyCycle; + // The expected latency of the critical path in this scheduled zone. + unsigned ExpectedLatency; + + // Resources used in the scheduled zone beyond this boundary. + SmallVector ResourceCounts; + + // Cache the critical resources ID in this scheduled zone. + unsigned CritResIdx; + + // Is the scheduled region resource limited vs. latency limited. + bool IsResourceLimited; + + unsigned ExpectedCount; + + // Policy flag: attempt to find ILP until expected latency is covered. + bool ShouldIncreaseILP; + +#ifndef NDEBUG // Remember the greatest min operand latency. unsigned MaxMinLatency; +#endif + + void reset() { + Available.clear(); + Pending.clear(); + CheckPending = false; + NextSUs.clear(); + HazardRec = 0; + CurrCycle = 0; + IssueCount = 0; + MinReadyCycle = UINT_MAX; + ExpectedLatency = 0; + ResourceCounts.resize(1); + assert(!ResourceCounts[0] && "nonzero count for bad resource"); + CritResIdx = 0; + IsResourceLimited = false; + ExpectedCount = 0; + ShouldIncreaseILP = false; +#ifndef NDEBUG + MaxMinLatency = 0; +#endif + // Reserve a zero-count for invalid CritResIdx. + ResourceCounts.resize(1); + } /// Pending queues extend the ready queues with the same ID and the /// PendingFlag set. SchedBoundary(unsigned ID, const Twine &Name): - DAG(0), SchedModel(0), Available(ID, Name+".A"), - Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"), - CheckPending(false), HazardRec(0), CurrCycle(0), IssueCount(0), - MinReadyCycle(UINT_MAX), MaxMinLatency(0) {} + DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"), + Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P") { + reset(); + } ~SchedBoundary() { delete HazardRec; } - void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel) { - DAG = dag; - SchedModel = smodel; - } + void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, + SchedRemainder *rem); bool isTop() const { return Available.getID() == ConvergingScheduler::TopQID; } + unsigned getUnscheduledLatency(SUnit *SU) const { + if (isTop()) + return SU->getHeight(); + return SU->getDepth(); + } + + unsigned getCriticalCount() const { + return ResourceCounts[CritResIdx]; + } + bool checkHazard(SUnit *SU); + void checkILPPolicy(); + void releaseNode(SUnit *SU, unsigned ReadyCycle); void bumpCycle(); + void countResource(unsigned PIdx, unsigned Cycles); + void bumpNode(SUnit *SU); void releasePending(); @@ -707,11 +882,13 @@ class ConvergingScheduler : public MachineSchedStrategy { SUnit *pickOnlyChoice(); }; +private: ScheduleDAGMI *DAG; const TargetSchedModel *SchedModel; const TargetRegisterInfo *TRI; // State of the top and bottom scheduled instruction boundaries. + SchedRemainder Rem; SchedBoundary Top; SchedBoundary Bot; @@ -736,25 +913,75 @@ public: virtual void releaseBottomNode(SUnit *SU); -protected: - SUnit *pickNodeBidrectional(bool &IsTopNode); + virtual void registerRoots(); + +protected: + void balanceZones( + ConvergingScheduler::SchedBoundary &CriticalZone, + ConvergingScheduler::SchedCandidate &CriticalCand, + ConvergingScheduler::SchedBoundary &OppositeZone, + ConvergingScheduler::SchedCandidate &OppositeCand); + + void checkResourceLimits(ConvergingScheduler::SchedCandidate &TopCand, + ConvergingScheduler::SchedCandidate &BotCand); + + void tryCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary &Zone, + const RegPressureTracker &RPTracker, + RegPressureTracker &TempTracker); + + SUnit *pickNodeBidirectional(bool &IsTopNode); + + void pickNodeFromQueue(SchedBoundary &Zone, + const RegPressureTracker &RPTracker, + SchedCandidate &Candidate); - CandResult pickNodeFromQueue(ReadyQueue &Q, - const RegPressureTracker &RPTracker, - SchedCandidate &Candidate); #ifndef NDEBUG - void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU, - PressureElement P = PressureElement()); + void traceCandidate(const SchedCandidate &Cand, const SchedBoundary &Zone); #endif }; } // namespace +void ConvergingScheduler::SchedRemainder:: +init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) { + reset(); + if (!SchedModel->hasInstrSchedModel()) + return; + RemainingCounts.resize(SchedModel->getNumProcResourceKinds()); + for (std::vector::iterator + I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) { + const MCSchedClassDesc *SC = DAG->getSchedClass(&*I); + RemainingMicroOps += SchedModel->getNumMicroOps(I->getInstr(), SC); + for (TargetSchedModel::ProcResIter + PI = SchedModel->getWriteProcResBegin(SC), + PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { + unsigned PIdx = PI->ProcResourceIdx; + unsigned Factor = SchedModel->getResourceFactor(PIdx); + RemainingCounts[PIdx] += (Factor * PI->Cycles); + } + } +} + +void ConvergingScheduler::SchedBoundary:: +init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { + reset(); + DAG = dag; + SchedModel = smodel; + Rem = rem; + if (SchedModel->hasInstrSchedModel()) + ResourceCounts.resize(SchedModel->getNumProcResourceKinds()); +} + void ConvergingScheduler::initialize(ScheduleDAGMI *dag) { DAG = dag; SchedModel = DAG->getSchedModel(); TRI = DAG->TRI; - Top.init(DAG, SchedModel); - Bot.init(DAG, SchedModel); + Rem.init(DAG, SchedModel); + Top.init(DAG, SchedModel, &Rem); + Bot.init(DAG, SchedModel, &Rem); + + // Initialize resource counts. // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or // are disabled, then these HazardRecs will be disabled. @@ -803,6 +1030,17 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) { Bot.releaseNode(SU, SU->BotReadyCycle); } +void ConvergingScheduler::registerRoots() { + Rem.CriticalPath = DAG->ExitSU.getDepth(); + // Some roots may not feed into ExitSU. Check all of them in case. + for (std::vector::const_iterator + I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) { + if ((*I)->getDepth() > Rem.CriticalPath) + Rem.CriticalPath = (*I)->getDepth(); + } + DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n'); +} + /// Does this SU have a hazard within the current instruction group. /// /// The scheduler supports two modes of hazard recognition. The first is the @@ -821,14 +1059,26 @@ bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) { return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard; unsigned uops = SchedModel->getNumMicroOps(SU->getInstr()); - if (IssueCount + uops > SchedModel->getIssueWidth()) + if ((IssueCount > 0) && (IssueCount + uops > SchedModel->getIssueWidth())) { + DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops=" + << SchedModel->getNumMicroOps(SU->getInstr()) << '\n'); return true; - + } return false; } +/// If expected latency is covered, disable ILP policy. +void ConvergingScheduler::SchedBoundary::checkILPPolicy() { + if (ShouldIncreaseILP + && (IsResourceLimited || ExpectedLatency <= CurrCycle)) { + ShouldIncreaseILP = false; + DEBUG(dbgs() << "Disable ILP: " << Available.getName() << '\n'); + } +} + void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) { + if (ReadyCycle < MinReadyCycle) MinReadyCycle = ReadyCycle; @@ -838,6 +1088,18 @@ void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU, Pending.push(SU); else Available.push(SU); + + // Record this node as an immediate dependent of the scheduled node. + NextSUs.insert(SU); + + // If CriticalPath has been computed, then check if the unscheduled nodes + // exceed the ILP window. Before registerRoots, CriticalPath==0. + if (Rem->CriticalPath && (ExpectedLatency + getUnscheduledLatency(SU) + > Rem->CriticalPath + ILPWindow)) { + ShouldIncreaseILP = true; + DEBUG(dbgs() << "Increase ILP: " << Available.getName() << " " + << ExpectedLatency << " + " << getUnscheduledLatency(SU) << '\n'); + } } /// Move the boundary of scheduled code by one cycle. @@ -845,8 +1107,12 @@ void ConvergingScheduler::SchedBoundary::bumpCycle() { unsigned Width = SchedModel->getIssueWidth(); IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width; + unsigned NextCycle = CurrCycle + 1; assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized"); - unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle); + if (MinReadyCycle > NextCycle) { + IssueCount = 0; + NextCycle = MinReadyCycle; + } if (!HazardRec->isEnabled()) { // Bypass HazardRec virtual calls. @@ -862,11 +1128,39 @@ void ConvergingScheduler::SchedBoundary::bumpCycle() { } } CheckPending = true; + IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle); - DEBUG(dbgs() << "*** " << Available.getName() << " cycle " + DEBUG(dbgs() << " *** " << Available.getName() << " cycle " << CurrCycle << '\n'); } +/// Add the given processor resource to this scheduled zone. +void ConvergingScheduler::SchedBoundary::countResource(unsigned PIdx, + unsigned Cycles) { + unsigned Factor = SchedModel->getResourceFactor(PIdx); + DEBUG(dbgs() << " " << SchedModel->getProcResource(PIdx)->Name + << " +(" << Cycles << "x" << Factor + << ") / " << SchedModel->getLatencyFactor() << '\n'); + + unsigned Count = Factor * Cycles; + ResourceCounts[PIdx] += Count; + assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted"); + Rem->RemainingCounts[PIdx] -= Count; + + // Reset MaxRemainingCount for sanity. + Rem->MaxRemainingCount = 0; + + // Check if this resource exceeds the current critical resource by a full + // cycle. If so, it becomes the critical resource. + if ((int)(ResourceCounts[PIdx] - ResourceCounts[CritResIdx]) + >= (int)SchedModel->getLatencyFactor()) { + CritResIdx = PIdx; + DEBUG(dbgs() << " *** Critical resource " + << SchedModel->getProcResource(PIdx)->Name << " x" + << ResourceCounts[PIdx] << '\n'); + } +} + /// Move the boundary of scheduled code by one SUnit. void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) { // Update the reservation table. @@ -878,11 +1172,38 @@ void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) { } HazardRec->EmitInstruction(SU); } + // Update resource counts and critical resource. + if (SchedModel->hasInstrSchedModel()) { + const MCSchedClassDesc *SC = DAG->getSchedClass(SU); + Rem->RemainingMicroOps -= SchedModel->getNumMicroOps(SU->getInstr(), SC); + for (TargetSchedModel::ProcResIter + PI = SchedModel->getWriteProcResBegin(SC), + PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { + countResource(PI->ProcResourceIdx, PI->Cycles); + } + } + if (isTop()) { + if (SU->getDepth() > ExpectedLatency) + ExpectedLatency = SU->getDepth(); + } + else { + if (SU->getHeight() > ExpectedLatency) + ExpectedLatency = SU->getHeight(); + } + + IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle); + // Check the instruction group dispatch limit. // TODO: Check if this SU must end a dispatch group. IssueCount += SchedModel->getNumMicroOps(SU->getInstr()); + + // checkHazard prevents scheduling multiple instructions per cycle that exceed + // issue width. However, we commonly reach the maximum. In this case + // opportunistically bump the cycle to avoid uselessly checking everything in + // the readyQ. Furthermore, a single instruction may produce more than one + // cycle's worth of micro-ops. if (IssueCount >= SchedModel->getIssueWidth()) { - DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n'); + DEBUG(dbgs() << " *** Max instrs at cycle " << CurrCycle << '\n'); bumpCycle(); } } @@ -913,6 +1234,7 @@ void ConvergingScheduler::SchedBoundary::releasePending() { Pending.remove(Pending.begin()+i); --i; --e; } + DEBUG(if (!Pending.empty()) Pending.dump()); CheckPending = false; } @@ -927,12 +1249,23 @@ void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) { } /// If this queue only has one ready candidate, return it. As a side effect, -/// advance the cycle until at least one node is ready. If multiple instructions -/// are ready, return NULL. +/// defer any nodes that now hit a hazard, and advance the cycle until at least +/// one node is ready. If multiple instructions are ready, return NULL. SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() { if (CheckPending) releasePending(); + if (IssueCount > 0) { + // Defer any ready instrs that now have a hazard. + for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) { + if (checkHazard(*I)) { + Pending.push(*I); + I = Available.remove(I); + continue; + } + ++I; + } + } for (unsigned i = 0; Available.empty(); ++i) { assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) && "permanent hazard"); (void)i; @@ -944,18 +1277,262 @@ SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() { return NULL; } -#ifndef NDEBUG -void ConvergingScheduler::traceCandidate(const char *Label, const ReadyQueue &Q, - SUnit *SU, PressureElement P) { - dbgs() << Label << " " << Q.getName() << " "; - if (P.isValid()) - dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease - << " "; - else - dbgs() << " "; - SU->dump(DAG); +/// Record the candidate policy for opposite zones with different critical +/// resources. +/// +/// If the CriticalZone is latency limited, don't force a policy for the +/// candidates here. Instead, When releasing each candidate, releaseNode +/// compares the region's critical path to the candidate's height or depth and +/// the scheduled zone's expected latency then sets ShouldIncreaseILP. +void ConvergingScheduler::balanceZones( + ConvergingScheduler::SchedBoundary &CriticalZone, + ConvergingScheduler::SchedCandidate &CriticalCand, + ConvergingScheduler::SchedBoundary &OppositeZone, + ConvergingScheduler::SchedCandidate &OppositeCand) { + + if (!CriticalZone.IsResourceLimited) + return; + + SchedRemainder *Rem = CriticalZone.Rem; + + // If the critical zone is overconsuming a resource relative to the + // remainder, try to reduce it. + unsigned RemainingCritCount = + Rem->RemainingCounts[CriticalZone.CritResIdx]; + if ((int)(Rem->MaxRemainingCount - RemainingCritCount) + > (int)SchedModel->getLatencyFactor()) { + CriticalCand.Policy.ReduceResIdx = CriticalZone.CritResIdx; + DEBUG(dbgs() << "Balance " << CriticalZone.Available.getName() << " reduce " + << SchedModel->getProcResource(CriticalZone.CritResIdx)->Name + << '\n'); + } + // If the other zone is underconsuming a resource relative to the full zone, + // try to increase it. + unsigned OppositeCount = + OppositeZone.ResourceCounts[CriticalZone.CritResIdx]; + if ((int)(OppositeZone.ExpectedCount - OppositeCount) + > (int)SchedModel->getLatencyFactor()) { + OppositeCand.Policy.DemandResIdx = CriticalZone.CritResIdx; + DEBUG(dbgs() << "Balance " << OppositeZone.Available.getName() << " demand " + << SchedModel->getProcResource(OppositeZone.CritResIdx)->Name + << '\n'); + } +} + +/// Determine if the scheduled zones exceed resource limits or critical path and +/// set each candidate's ReduceHeight policy accordingly. +void ConvergingScheduler::checkResourceLimits( + ConvergingScheduler::SchedCandidate &TopCand, + ConvergingScheduler::SchedCandidate &BotCand) { + + Bot.checkILPPolicy(); + Top.checkILPPolicy(); + if (Bot.ShouldIncreaseILP) + BotCand.Policy.ReduceLatency = true; + if (Top.ShouldIncreaseILP) + TopCand.Policy.ReduceLatency = true; + + // Handle resource-limited regions. + if (Top.IsResourceLimited && Bot.IsResourceLimited + && Top.CritResIdx == Bot.CritResIdx) { + // If the scheduled critical resource in both zones is no longer the + // critical remaining resource, attempt to reduce resource height both ways. + if (Top.CritResIdx != Rem.CritResIdx) { + TopCand.Policy.ReduceResIdx = Top.CritResIdx; + BotCand.Policy.ReduceResIdx = Bot.CritResIdx; + DEBUG(dbgs() << "Reduce scheduled " + << SchedModel->getProcResource(Top.CritResIdx)->Name << '\n'); + } + return; + } + // Handle latency-limited regions. + if (!Top.IsResourceLimited && !Bot.IsResourceLimited) { + // If the total scheduled expected latency exceeds the region's critical + // path then reduce latency both ways. + // + // Just because a zone is not resource limited does not mean it is latency + // limited. Unbuffered resource, such as max micro-ops may cause CurrCycle + // to exceed expected latency. + if ((Top.ExpectedLatency + Bot.ExpectedLatency >= Rem.CriticalPath) + && (Rem.CriticalPath > Top.CurrCycle + Bot.CurrCycle)) { + TopCand.Policy.ReduceLatency = true; + BotCand.Policy.ReduceLatency = true; + DEBUG(dbgs() << "Reduce scheduled latency " << Top.ExpectedLatency + << " + " << Bot.ExpectedLatency << '\n'); + } + return; + } + // The critical resource is different in each zone, so request balancing. + + // Compute the cost of each zone. + Rem.MaxRemainingCount = std::max( + Rem.RemainingMicroOps * SchedModel->getMicroOpFactor(), + Rem.RemainingCounts[Rem.CritResIdx]); + Top.ExpectedCount = std::max(Top.ExpectedLatency, Top.CurrCycle); + Top.ExpectedCount = std::max( + Top.getCriticalCount(), + Top.ExpectedCount * SchedModel->getLatencyFactor()); + Bot.ExpectedCount = std::max(Bot.ExpectedLatency, Bot.CurrCycle); + Bot.ExpectedCount = std::max( + Bot.getCriticalCount(), + Bot.ExpectedCount * SchedModel->getLatencyFactor()); + + balanceZones(Top, TopCand, Bot, BotCand); + balanceZones(Bot, BotCand, Top, TopCand); +} + +void ConvergingScheduler::SchedCandidate:: +initResourceDelta(const ScheduleDAGMI *DAG, + const TargetSchedModel *SchedModel) { + if (!Policy.ReduceResIdx && !Policy.DemandResIdx) + return; + + const MCSchedClassDesc *SC = DAG->getSchedClass(SU); + for (TargetSchedModel::ProcResIter + PI = SchedModel->getWriteProcResBegin(SC), + PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { + if (PI->ProcResourceIdx == Policy.ReduceResIdx) + ResDelta.CritResources += PI->Cycles; + if (PI->ProcResourceIdx == Policy.DemandResIdx) + ResDelta.DemandedResources += PI->Cycles; + } +} + +/// Return true if this heuristic determines order. +static bool tryLess(unsigned TryVal, unsigned CandVal, + ConvergingScheduler::SchedCandidate &TryCand, + ConvergingScheduler::SchedCandidate &Cand, + ConvergingScheduler::CandReason Reason) { + if (TryVal < CandVal) { + TryCand.Reason = Reason; + return true; + } + if (TryVal > CandVal) { + if (Cand.Reason > Reason) + Cand.Reason = Reason; + return true; + } + return false; +} +static bool tryGreater(unsigned TryVal, unsigned CandVal, + ConvergingScheduler::SchedCandidate &TryCand, + ConvergingScheduler::SchedCandidate &Cand, + ConvergingScheduler::CandReason Reason) { + if (TryVal > CandVal) { + TryCand.Reason = Reason; + return true; + } + if (TryVal < CandVal) { + if (Cand.Reason > Reason) + Cand.Reason = Reason; + return true; + } + return false; +} + +/// Apply a set of heursitics to a new candidate. Heuristics are currently +/// hierarchical. This may be more efficient than a graduated cost model because +/// we don't need to evaluate all aspects of the model for each node in the +/// queue. But it's really done to make the heuristics easier to debug and +/// statistically analyze. +/// +/// \param Cand provides the policy and current best candidate. +/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. +/// \param Zone describes the scheduled zone that we are extending. +/// \param RPTracker describes reg pressure within the scheduled zone. +/// \param TempTracker is a scratch pressure tracker to reuse in queries. +void ConvergingScheduler::tryCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary &Zone, + const RegPressureTracker &RPTracker, + RegPressureTracker &TempTracker) { + + // Always initialize TryCand's RPDelta. + TempTracker.getMaxPressureDelta(TryCand.SU->getInstr(), TryCand.RPDelta, + DAG->getRegionCriticalPSets(), + DAG->getRegPressure().MaxSetPressure); + + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return; + } + // Avoid exceeding the target's limit. + if (tryLess(TryCand.RPDelta.Excess.UnitIncrease, + Cand.RPDelta.Excess.UnitIncrease, TryCand, Cand, SingleExcess)) + return; + if (Cand.Reason == SingleExcess) + Cand.Reason = MultiPressure; + + // Avoid increasing the max critical pressure in the scheduled region. + if (tryLess(TryCand.RPDelta.CriticalMax.UnitIncrease, + Cand.RPDelta.CriticalMax.UnitIncrease, + TryCand, Cand, SingleCritical)) + return; + if (Cand.Reason == SingleCritical) + Cand.Reason = MultiPressure; + + // Avoid critical resource consumption and balance the schedule. + TryCand.initResourceDelta(DAG, SchedModel); + if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand, ResourceReduce)) + return; + if (tryGreater(TryCand.ResDelta.DemandedResources, + Cand.ResDelta.DemandedResources, + TryCand, Cand, ResourceDemand)) + return; + + // Avoid serializing long latency dependence chains. + if (Cand.Policy.ReduceLatency) { + if (Zone.isTop()) { + if (Cand.SU->getDepth() * SchedModel->getLatencyFactor() + > Zone.ExpectedCount) { + if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), + TryCand, Cand, TopDepthReduce)) + return; + } + if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(), + TryCand, Cand, TopPathReduce)) + return; + } + else { + if (Cand.SU->getHeight() * SchedModel->getLatencyFactor() + > Zone.ExpectedCount) { + if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), + TryCand, Cand, BotHeightReduce)) + return; + } + if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(), + TryCand, Cand, BotPathReduce)) + return; + } + } + + // Avoid increasing the max pressure of the entire region. + if (tryLess(TryCand.RPDelta.CurrentMax.UnitIncrease, + Cand.RPDelta.CurrentMax.UnitIncrease, TryCand, Cand, SingleMax)) + return; + if (Cand.Reason == SingleMax) + Cand.Reason = MultiPressure; + + // Prefer immediate defs/users of the last scheduled instruction. This is a + // nice pressure avoidance strategy that also conserves the processor's + // register renaming resources and keeps the machine code readable. + if (Zone.NextSUs.count(TryCand.SU) && !Zone.NextSUs.count(Cand.SU)) { + TryCand.Reason = NextDefUse; + return; + } + if (!Zone.NextSUs.count(TryCand.SU) && Zone.NextSUs.count(Cand.SU)) { + if (Cand.Reason > NextDefUse) + Cand.Reason = NextDefUse; + return; + } + // Fall through to original instruction order. + if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) + || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { + TryCand.Reason = NodeOrder; + } } -#endif /// pickNodeFromQueue helper that returns true if the LHS reg pressure effect is /// more desirable than RHS from scheduling standpoint. @@ -966,109 +1543,143 @@ static bool compareRPDelta(const RegPressureDelta &LHS, // have UnitIncrease==0, so are neutral. // Avoid increasing the max critical pressure in the scheduled region. - if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease) + if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease) { + DEBUG(dbgs() << "RP excess top - bot: " + << (LHS.Excess.UnitIncrease - RHS.Excess.UnitIncrease) << '\n'); return LHS.Excess.UnitIncrease < RHS.Excess.UnitIncrease; - + } // Avoid increasing the max critical pressure in the scheduled region. - if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease) + if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease) { + DEBUG(dbgs() << "RP critical top - bot: " + << (LHS.CriticalMax.UnitIncrease - RHS.CriticalMax.UnitIncrease) + << '\n'); return LHS.CriticalMax.UnitIncrease < RHS.CriticalMax.UnitIncrease; - + } // Avoid increasing the max pressure of the entire region. - if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease) + if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease) { + DEBUG(dbgs() << "RP current top - bot: " + << (LHS.CurrentMax.UnitIncrease - RHS.CurrentMax.UnitIncrease) + << '\n'); return LHS.CurrentMax.UnitIncrease < RHS.CurrentMax.UnitIncrease; - + } return false; } +#ifndef NDEBUG +const char *ConvergingScheduler::getReasonStr( + ConvergingScheduler::CandReason Reason) { + switch (Reason) { + case NoCand: return "NOCAND "; + case SingleExcess: return "REG-EXCESS"; + case SingleCritical: return "REG-CRIT "; + case SingleMax: return "REG-MAX "; + case MultiPressure: return "REG-MULTI "; + case ResourceReduce: return "RES-REDUCE"; + case ResourceDemand: return "RES-DEMAND"; + case TopDepthReduce: return "TOP-DEPTH "; + case TopPathReduce: return "TOP-PATH "; + case BotHeightReduce:return "BOT-HEIGHT"; + case BotPathReduce: return "BOT-PATH "; + case NextDefUse: return "DEF-USE "; + case NodeOrder: return "ORDER "; + }; +} + +void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand, + const SchedBoundary &Zone) { + const char *Label = getReasonStr(Cand.Reason); + PressureElement P; + unsigned ResIdx = 0; + unsigned Latency = 0; + switch (Cand.Reason) { + default: + break; + case SingleExcess: + P = Cand.RPDelta.Excess; + break; + case SingleCritical: + P = Cand.RPDelta.CriticalMax; + break; + case SingleMax: + P = Cand.RPDelta.CurrentMax; + break; + case ResourceReduce: + ResIdx = Cand.Policy.ReduceResIdx; + break; + case ResourceDemand: + ResIdx = Cand.Policy.DemandResIdx; + break; + case TopDepthReduce: + Latency = Cand.SU->getDepth(); + break; + case TopPathReduce: + Latency = Cand.SU->getHeight(); + break; + case BotHeightReduce: + Latency = Cand.SU->getHeight(); + break; + case BotPathReduce: + Latency = Cand.SU->getDepth(); + break; + } + dbgs() << Label << " " << Zone.Available.getName() << " "; + if (P.isValid()) + dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease + << " "; + else + dbgs() << " "; + if (ResIdx) + dbgs() << SchedModel->getProcResource(ResIdx)->Name << " "; + else + dbgs() << " "; + if (Latency) + dbgs() << Latency << " cycles "; + else + dbgs() << " "; + Cand.SU->dump(DAG); +} +#endif + /// Pick the best candidate from the top queue. /// /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during /// DAG building. To adjust for the current scheduling location we need to /// maintain the number of vreg uses remaining to be top-scheduled. -ConvergingScheduler::CandResult ConvergingScheduler:: -pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker, - SchedCandidate &Candidate) { +void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone, + const RegPressureTracker &RPTracker, + SchedCandidate &Cand) { + ReadyQueue &Q = Zone.Available; + DEBUG(Q.dump()); // getMaxPressureDelta temporarily modifies the tracker. RegPressureTracker &TempTracker = const_cast(RPTracker); - // BestSU remains NULL if no top candidates beat the best existing candidate. - CandResult FoundCandidate = NoCand; for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { - RegPressureDelta RPDelta; - TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta, - DAG->getRegionCriticalPSets(), - DAG->getRegPressure().MaxSetPressure); - // Initialize the candidate if needed. - if (!Candidate.SU) { - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - FoundCandidate = NodeOrder; - continue; - } - // Avoid exceeding the target's limit. - if (RPDelta.Excess.UnitIncrease < Candidate.RPDelta.Excess.UnitIncrease) { - DEBUG(traceCandidate("ECAND", Q, *I, RPDelta.Excess)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - FoundCandidate = SingleExcess; - continue; - } - if (RPDelta.Excess.UnitIncrease > Candidate.RPDelta.Excess.UnitIncrease) - continue; - if (FoundCandidate == SingleExcess) - FoundCandidate = MultiPressure; - - // Avoid increasing the max critical pressure in the scheduled region. - if (RPDelta.CriticalMax.UnitIncrease - < Candidate.RPDelta.CriticalMax.UnitIncrease) { - DEBUG(traceCandidate("PCAND", Q, *I, RPDelta.CriticalMax)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - FoundCandidate = SingleCritical; - continue; - } - if (RPDelta.CriticalMax.UnitIncrease - > Candidate.RPDelta.CriticalMax.UnitIncrease) - continue; - if (FoundCandidate == SingleCritical) - FoundCandidate = MultiPressure; - - // Avoid increasing the max pressure of the entire region. - if (RPDelta.CurrentMax.UnitIncrease - < Candidate.RPDelta.CurrentMax.UnitIncrease) { - DEBUG(traceCandidate("MCAND", Q, *I, RPDelta.CurrentMax)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - FoundCandidate = SingleMax; - continue; - } - if (RPDelta.CurrentMax.UnitIncrease - > Candidate.RPDelta.CurrentMax.UnitIncrease) - continue; - if (FoundCandidate == SingleMax) - FoundCandidate = MultiPressure; - - // Fall through to original instruction order. - // Only consider node order if Candidate was chosen from this Q. - if (FoundCandidate == NoCand) - continue; - - if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum) - || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) { - DEBUG(traceCandidate("NCAND", Q, *I)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - FoundCandidate = NodeOrder; + SchedCandidate TryCand(Cand.Policy); + TryCand.SU = *I; + tryCandidate(Cand, TryCand, Zone, RPTracker, TempTracker); + if (TryCand.Reason != NoCand) { + // Initialize resource delta if needed in case future heuristics query it. + if (TryCand.ResDelta == SchedResourceDelta()) + TryCand.initResourceDelta(DAG, SchedModel); + Cand.setBest(TryCand); + DEBUG(traceCandidate(Cand, Zone)); } + TryCand.SU = *I; } - return FoundCandidate; +} + +static void tracePick(const ConvergingScheduler::SchedCandidate &Cand, + bool IsTop) { + DEBUG(dbgs() << "Pick " << (IsTop ? "top" : "bot") + << " SU(" << Cand.SU->NodeNum << ") " + << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n'); } /// Pick the best candidate node from either the top or bottom queue. -SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) { +SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) { // Schedule as far as possible in the direction of no choice. This is most // efficient, but also provides the best heuristics for CriticalPSets. if (SUnit *SU = Bot.pickOnlyChoice()) { @@ -1079,11 +1690,14 @@ SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) { IsTopNode = true; return SU; } - SchedCandidate BotCand; + CandPolicy NoPolicy; + SchedCandidate BotCand(NoPolicy); + SchedCandidate TopCand(NoPolicy); + checkResourceLimits(TopCand, BotCand); + // Prefer bottom scheduling when heuristics are silent. - CandResult BotResult = pickNodeFromQueue(Bot.Available, - DAG->getBotRPTracker(), BotCand); - assert(BotResult != NoCand && "failed to find the first candidate"); + pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); + assert(BotCand.Reason != NoCand && "failed to find the first candidate"); // If either Q has a single candidate that provides the least increase in // Excess pressure, we can immediately schedule from that Q. @@ -1092,37 +1706,41 @@ SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) { // affects picking from either Q. If scheduling in one direction must // increase pressure for one of the excess PSets, then schedule in that // direction first to provide more freedom in the other direction. - if (BotResult == SingleExcess || BotResult == SingleCritical) { + if (BotCand.Reason == SingleExcess || BotCand.Reason == SingleCritical) { IsTopNode = false; + tracePick(BotCand, IsTopNode); return BotCand.SU; } // Check if the top Q has a better candidate. - SchedCandidate TopCand; - CandResult TopResult = pickNodeFromQueue(Top.Available, - DAG->getTopRPTracker(), TopCand); - assert(TopResult != NoCand && "failed to find the first candidate"); + pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); + assert(TopCand.Reason != NoCand && "failed to find the first candidate"); - if (TopResult == SingleExcess || TopResult == SingleCritical) { - IsTopNode = true; - return TopCand.SU; - } // If either Q has a single candidate that minimizes pressure above the // original region's pressure pick it. - if (BotResult == SingleMax) { + if (TopCand.Reason <= SingleMax || BotCand.Reason <= SingleMax) { + if (TopCand.Reason < BotCand.Reason) { + IsTopNode = true; + tracePick(TopCand, IsTopNode); + return TopCand.SU; + } IsTopNode = false; + tracePick(BotCand, IsTopNode); return BotCand.SU; } - if (TopResult == SingleMax) { - IsTopNode = true; - return TopCand.SU; - } // Check for a salient pressure difference and pick the best from either side. if (compareRPDelta(TopCand.RPDelta, BotCand.RPDelta)) { IsTopNode = true; + tracePick(TopCand, IsTopNode); + return TopCand.SU; + } + // Otherwise prefer the bottom candidate, in node order if all else failed. + if (TopCand.Reason < BotCand.Reason) { + IsTopNode = true; + tracePick(TopCand, IsTopNode); return TopCand.SU; } - // Otherwise prefer the bottom candidate in node order. IsTopNode = false; + tracePick(BotCand, IsTopNode); return BotCand.SU; } @@ -1138,11 +1756,10 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) { if (ForceTopDown) { SU = Top.pickOnlyChoice(); if (!SU) { - SchedCandidate TopCand; - CandResult TopResult = - pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand); - assert(TopResult != NoCand && "failed to find the first candidate"); - (void)TopResult; + CandPolicy NoPolicy; + SchedCandidate TopCand(NoPolicy); + pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); + assert(TopCand.Reason != NoCand && "failed to find the first candidate"); SU = TopCand.SU; } IsTopNode = true; @@ -1150,17 +1767,16 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) { else if (ForceBottomUp) { SU = Bot.pickOnlyChoice(); if (!SU) { - SchedCandidate BotCand; - CandResult BotResult = - pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand); - assert(BotResult != NoCand && "failed to find the first candidate"); - (void)BotResult; + CandPolicy NoPolicy; + SchedCandidate BotCand(NoPolicy); + pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); + assert(BotCand.Reason != NoCand && "failed to find the first candidate"); SU = BotCand.SU; } IsTopNode = false; } else { - SU = pickNodeBidrectional(IsTopNode); + SU = pickNodeBidirectional(IsTopNode); } } while (SU->isScheduled); diff --git a/test/CodeGen/X86/misched-balance.ll b/test/CodeGen/X86/misched-balance.ll new file mode 100644 index 00000000000..2184d9e9603 --- /dev/null +++ b/test/CodeGen/X86/misched-balance.ll @@ -0,0 +1,230 @@ +; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \ +; RUN: -verify-machineinstrs | FileCheck %s +; +; Verify that misched resource/latency balancy heuristics are sane. + +define void @unrolled_mmult1(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94, + i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99, + i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104) + nounwind uwtable ssp { +entry: + br label %for.body + +; imull folded loads should be in order and interleaved with addl, never +; adjacent. Also check that we have no spilling. +; +; Since mmult1 IR is already in good order, this effectively ensure +; the scheduler maintains source order. +; +; CHECK: %for.body +; CHECK-NOT: %rsp +; CHECK: imull 4 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 8 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 12 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 16 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 20 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 24 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 28 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 32 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 36 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK-NOT: {{imull|rsp}} +; CHECK: %end +for.body: + %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ] + %tmp57 = load i32* %tmp56, align 4 + %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i + %tmp58 = load i32* %arrayidx12.us.i61, align 4 + %mul.us.i = mul nsw i32 %tmp58, %tmp57 + %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1 + %tmp59 = load i32* %arrayidx8.us.i.1, align 4 + %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i + %tmp60 = load i32* %arrayidx12.us.i61.1, align 4 + %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59 + %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i + %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2 + %tmp61 = load i32* %arrayidx8.us.i.2, align 4 + %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i + %tmp62 = load i32* %arrayidx12.us.i61.2, align 4 + %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61 + %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1 + %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3 + %tmp63 = load i32* %arrayidx8.us.i.3, align 4 + %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i + %tmp64 = load i32* %arrayidx12.us.i61.3, align 4 + %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63 + %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2 + %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4 + %tmp65 = load i32* %arrayidx8.us.i.4, align 4 + %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i + %tmp66 = load i32* %arrayidx12.us.i61.4, align 4 + %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65 + %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3 + %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5 + %tmp67 = load i32* %arrayidx8.us.i.5, align 4 + %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i + %tmp68 = load i32* %arrayidx12.us.i61.5, align 4 + %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67 + %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4 + %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6 + %tmp69 = load i32* %arrayidx8.us.i.6, align 4 + %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i + %tmp70 = load i32* %arrayidx12.us.i61.6, align 4 + %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69 + %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5 + %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7 + %tmp71 = load i32* %arrayidx8.us.i.7, align 4 + %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i + %tmp72 = load i32* %arrayidx12.us.i61.7, align 4 + %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71 + %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6 + %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8 + %tmp73 = load i32* %arrayidx8.us.i.8, align 4 + %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i + %tmp74 = load i32* %arrayidx12.us.i61.8, align 4 + %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73 + %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7 + %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9 + %tmp75 = load i32* %arrayidx8.us.i.9, align 4 + %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i + %tmp76 = load i32* %arrayidx12.us.i61.9, align 4 + %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75 + %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8 + %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i + store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4 + %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 10 + br i1 %exitcond, label %end, label %for.body + +end: + ret void +} + +; Unlike the above loop, this IR starts out bad and must be +; rescheduled. +; +; CHECK: %for.body +; CHECK-NOT: %rsp +; CHECK: imull 4 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 8 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 12 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 16 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 20 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 24 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 28 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 32 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 36 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK-NOT: {{imull|rsp}} +; CHECK: %end +define void @unrolled_mmult2(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94, + i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99, + i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104) + nounwind uwtable ssp { +entry: + br label %for.body +for.body: + %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ] + %tmp57 = load i32* %tmp56, align 4 + %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i + %tmp58 = load i32* %arrayidx12.us.i61, align 4 + %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1 + %tmp59 = load i32* %arrayidx8.us.i.1, align 4 + %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i + %tmp60 = load i32* %arrayidx12.us.i61.1, align 4 + %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2 + %tmp61 = load i32* %arrayidx8.us.i.2, align 4 + %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i + %tmp62 = load i32* %arrayidx12.us.i61.2, align 4 + %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3 + %tmp63 = load i32* %arrayidx8.us.i.3, align 4 + %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i + %tmp64 = load i32* %arrayidx12.us.i61.3, align 4 + %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4 + %tmp65 = load i32* %arrayidx8.us.i.4, align 4 + %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i + %tmp66 = load i32* %arrayidx12.us.i61.4, align 4 + %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5 + %tmp67 = load i32* %arrayidx8.us.i.5, align 4 + %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i + %tmp68 = load i32* %arrayidx12.us.i61.5, align 4 + %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6 + %tmp69 = load i32* %arrayidx8.us.i.6, align 4 + %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i + %tmp70 = load i32* %arrayidx12.us.i61.6, align 4 + %mul.us.i = mul nsw i32 %tmp58, %tmp57 + %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7 + %tmp71 = load i32* %arrayidx8.us.i.7, align 4 + %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i + %tmp72 = load i32* %arrayidx12.us.i61.7, align 4 + %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8 + %tmp73 = load i32* %arrayidx8.us.i.8, align 4 + %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i + %tmp74 = load i32* %arrayidx12.us.i61.8, align 4 + %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9 + %tmp75 = load i32* %arrayidx8.us.i.9, align 4 + %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i + %tmp76 = load i32* %arrayidx12.us.i61.9, align 4 + %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59 + %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i + %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61 + %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1 + %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63 + %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2 + %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65 + %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3 + %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67 + %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4 + %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69 + %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5 + %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71 + %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6 + %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73 + %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7 + %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75 + %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8 + %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i + store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4 + %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 10 + br i1 %exitcond, label %end, label %for.body + +end: + ret void +}