Add a hybrid bottom up scheduler that reduce register usage while avoiding

pipeline stall. It's useful for targets like ARM cortex-a8. NEON has a lot
of long latency instructions so a strict register pressure reduction
scheduler does not work well.
Early experiments show this speeds up some NEON loops by over 30%.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@104216 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Evan Cheng 2010-05-20 06:13:19 +00:00
parent 761fd4c1d9
commit 15a16def6e
8 changed files with 162 additions and 31 deletions

View File

@ -46,6 +46,7 @@ namespace {
(void) llvm::createBURRListDAGScheduler(NULL, llvm::CodeGenOpt::Default);
(void) llvm::createTDRRListDAGScheduler(NULL, llvm::CodeGenOpt::Default);
(void) llvm::createSourceListDAGScheduler(NULL,llvm::CodeGenOpt::Default);
(void) llvm::createHybridListDAGScheduler(NULL,llvm::CodeGenOpt::Default);
(void) llvm::createTDListDAGScheduler(NULL, llvm::CodeGenOpt::Default);
(void) llvm::createFastDAGScheduler(NULL, llvm::CodeGenOpt::Default);
(void) llvm::createDefaultScheduler(NULL, llvm::CodeGenOpt::Default);

View File

@ -409,7 +409,9 @@ namespace llvm {
/// implementation to decide.
///
class SchedulingPriorityQueue {
unsigned CurCycle;
public:
SchedulingPriorityQueue() : CurCycle(0) {}
virtual ~SchedulingPriorityQueue() {}
virtual void initNodes(std::vector<SUnit> &SUnits) = 0;
@ -433,6 +435,14 @@ namespace llvm {
virtual void ScheduledNode(SUnit *) {}
virtual void UnscheduledNode(SUnit *) {}
void setCurCycle(unsigned Cycle) {
CurCycle = Cycle;
}
unsigned getCurCycle() const {
return CurCycle;
}
};
class ScheduleDAG {

View File

@ -73,11 +73,17 @@ ScheduleDAGSDNodes *createBURRListDAGScheduler(SelectionDAGISel *IS,
ScheduleDAGSDNodes *createTDRRListDAGScheduler(SelectionDAGISel *IS,
CodeGenOpt::Level OptLevel);
/// createBURRListDAGScheduler - This creates a bottom up register usage
/// reduction list scheduler that schedules in source code order when possible.
/// createBURRListDAGScheduler - This creates a bottom up list scheduler that
/// schedules nodes in source code order when possible.
ScheduleDAGSDNodes *createSourceListDAGScheduler(SelectionDAGISel *IS,
CodeGenOpt::Level OptLevel);
/// createHybridListDAGScheduler - This creates a bottom up hybrid register
/// usage reduction list scheduler that make use of latency information to
/// avoid stalls for long latency instructions.
ScheduleDAGSDNodes *createHybridListDAGScheduler(SelectionDAGISel *IS,
CodeGenOpt::Level);
/// createTDListDAGScheduler - This creates a top-down list scheduler with
/// a hazard recognizer.
ScheduleDAGSDNodes *createTDListDAGScheduler(SelectionDAGISel *IS,

View File

@ -73,7 +73,8 @@ namespace CodeGenOpt {
namespace Sched {
enum Preference {
Latency, // Scheduling for shortest total latency.
RegPressure // Scheduling for lowest register pressure.
RegPressure, // Scheduling for lowest register pressure.
Hybrid // Scheduling for both latency and register pressure.
};
}

View File

@ -53,6 +53,12 @@ static RegisterScheduler
"order when possible",
createSourceListDAGScheduler);
static RegisterScheduler
hybridListDAGScheduler("hybrid",
"Bottom-up rr list scheduling which avoid stalls for "
"long latency instructions",
createHybridListDAGScheduler);
namespace {
//===----------------------------------------------------------------------===//
/// ScheduleDAGRRList - The actual register reduction list scheduler
@ -64,6 +70,10 @@ private:
/// it is top-down.
bool isBottomUp;
/// NeedLatency - True if the scheduler will make use of latency information.
///
bool NeedLatency;
/// AvailableQueue - The priority queue to use for the available SUnits.
SchedulingPriorityQueue *AvailableQueue;
@ -80,9 +90,9 @@ private:
public:
ScheduleDAGRRList(MachineFunction &mf,
bool isbottomup,
bool isbottomup, bool needlatency,
SchedulingPriorityQueue *availqueue)
: ScheduleDAGSDNodes(mf), isBottomUp(isbottomup),
: ScheduleDAGSDNodes(mf), isBottomUp(isbottomup), NeedLatency(needlatency),
AvailableQueue(availqueue), Topo(SUnits) {
}
@ -161,9 +171,11 @@ private:
return NewNode;
}
/// ForceUnitLatencies - Return true, since register-pressure-reducing
/// scheduling doesn't need actual latency information.
bool ForceUnitLatencies() const { return true; }
/// ForceUnitLatencies - Register-pressure-reducing scheduling doesn't
/// need actual latency information but the hybrid scheduler does.
bool ForceUnitLatencies() const {
return !NeedLatency;
}
};
} // end anonymous namespace
@ -213,6 +225,12 @@ void ScheduleDAGRRList::ReleasePred(SUnit *SU, const SDep *PredEdge) {
#endif
--PredSU->NumSuccsLeft;
if (!ForceUnitLatencies()) {
// Updating predecessor's height. This is now the cycle when the
// predecessor can be scheduled without causing a pipeline stall.
PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge->getLatency());
}
// If all the node's successors are scheduled, this node is ready
// to be scheduled. Ignore the special EntrySU node.
if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) {
@ -244,10 +262,15 @@ void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU, unsigned CurCycle) {
/// count of its predecessors. If a predecessor pending count is zero, add it to
/// the Available queue.
void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) {
DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
DEBUG(dbgs() << "\n*** Scheduling [" << CurCycle << "]: ");
DEBUG(SU->dump(this));
assert(CurCycle >= SU->getHeight() && "Node scheduled below its height!");
#ifndef NDEBUG
if (CurCycle < SU->getHeight())
DEBUG(dbgs() << " Height [" << SU->getHeight() << "] pipeline stall!\n");
#endif
// FIXME: Handle noop hazard.
SU->setHeightToAtLeast(CurCycle);
Sequence.push_back(SU);
@ -339,6 +362,7 @@ void ScheduleDAGRRList::BacktrackBottomUp(SUnit *SU, unsigned BtCycle,
SU->isAvailable = false;
UnscheduleNodeBottomUp(OldSU);
--CurCycle;
AvailableQueue->setCurCycle(CurCycle);
}
assert(!SU->isSucc(OldSU) && "Something is wrong!");
@ -386,7 +410,7 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
return NULL;
DEBUG(dbgs() << "Unfolding SU # " << SU->NodeNum << "\n");
DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n");
assert(NewNodes.size() == 2 && "Expected a load folding node!");
N = NewNodes[1];
@ -504,7 +528,7 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
SU = NewSU;
}
DEBUG(dbgs() << "Duplicating SU # " << SU->NodeNum << "\n");
DEBUG(dbgs() << " Duplicating SU #" << SU->NodeNum << "\n");
NewSU = CreateClone(SU);
// New SUnit has the exact same predecessors.
@ -786,7 +810,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() {
// Issue copies, these can be expensive cross register class copies.
SmallVector<SUnit*, 2> Copies;
InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
DEBUG(dbgs() << "Adding an edge from SU #" << TrySU->NodeNum
DEBUG(dbgs() << " Adding an edge from SU #" << TrySU->NodeNum
<< " to SU #" << Copies.front()->NodeNum << "\n");
AddPred(TrySU, SDep(Copies.front(), SDep::Order, /*Latency=*/1,
/*Reg=*/0, /*isNormalMemory=*/false,
@ -795,7 +819,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() {
NewDef = Copies.back();
}
DEBUG(dbgs() << "Adding an edge from SU #" << NewDef->NodeNum
DEBUG(dbgs() << " Adding an edge from SU #" << NewDef->NodeNum
<< " to SU #" << TrySU->NodeNum << "\n");
LiveRegDefs[Reg] = NewDef;
AddPred(NewDef, SDep(TrySU, SDep::Order, /*Latency=*/1,
@ -821,6 +845,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() {
if (CurSU)
ScheduleNodeBottomUp(CurSU, CurCycle);
++CurCycle;
AvailableQueue->setCurCycle(CurCycle);
}
// Reverse the order if it is bottom up.
@ -889,6 +914,7 @@ void ScheduleDAGRRList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
/// schedulers.
void ScheduleDAGRRList::ListScheduleTopDown() {
unsigned CurCycle = 0;
AvailableQueue->setCurCycle(CurCycle);
// Release any successors of the special Entry node.
ReleaseSuccessors(&EntrySU);
@ -911,6 +937,7 @@ void ScheduleDAGRRList::ListScheduleTopDown() {
if (CurSU)
ScheduleNodeTopDown(CurSU, CurCycle);
++CurCycle;
AvailableQueue->setCurCycle(CurCycle);
}
#ifndef NDEBUG
@ -956,6 +983,16 @@ namespace {
bool operator()(const SUnit* left, const SUnit* right) const;
};
struct hybrid_ls_rr_sort : public std::binary_function<SUnit*, SUnit*, bool> {
RegReductionPriorityQueue<hybrid_ls_rr_sort> *SPQ;
hybrid_ls_rr_sort(RegReductionPriorityQueue<hybrid_ls_rr_sort> *spq)
: SPQ(spq) {}
hybrid_ls_rr_sort(const hybrid_ls_rr_sort &RHS)
: SPQ(RHS.SPQ) {}
bool operator()(const SUnit* left, const SUnit* right) const;
};
} // end anonymous namespace
/// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number.
@ -991,7 +1028,7 @@ namespace {
template<class SF>
class RegReductionPriorityQueue : public SchedulingPriorityQueue {
PriorityQueue<SUnit*, std::vector<SUnit*>, SF> Queue;
unsigned currentQueueId;
unsigned CurQueueId;
protected:
// SUnits - The SUnits for the current graph.
@ -1007,7 +1044,7 @@ namespace {
public:
RegReductionPriorityQueue(const TargetInstrInfo *tii,
const TargetRegisterInfo *tri)
: Queue(SF(this)), currentQueueId(0),
: Queue(SF(this)), CurQueueId(0),
TII(tii), TRI(tri), scheduleDAG(NULL) {}
void initNodes(std::vector<SUnit> &sunits) {
@ -1067,14 +1104,14 @@ namespace {
unsigned getNodeOrdering(const SUnit *SU) const {
return scheduleDAG->DAG->GetOrdering(SU->getNode());
}
unsigned size() const { return Queue.size(); }
bool empty() const { return Queue.empty(); }
void push(SUnit *U) {
assert(!U->NodeQueueId && "Node in the queue already");
U->NodeQueueId = ++currentQueueId;
U->NodeQueueId = ++CurQueueId;
Queue.push(U);
}
@ -1117,6 +1154,9 @@ namespace {
typedef RegReductionPriorityQueue<src_ls_rr_sort>
SrcRegReductionPriorityQueue;
typedef RegReductionPriorityQueue<hybrid_ls_rr_sort>
HybridBURRPriorityQueue;
}
/// closestSucc - Returns the scheduled cycle of the successor which is
@ -1203,7 +1243,7 @@ bool bu_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
}
// Source order, otherwise bottom up.
bool src_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{
bool src_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
unsigned LOrder = SPQ->getNodeOrdering(left);
unsigned ROrder = SPQ->getNodeOrdering(right);
@ -1215,6 +1255,23 @@ bool src_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{
return BURRSort(left, right, SPQ);
}
bool hybrid_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{
bool LStall = SPQ->getCurCycle() < left->getHeight();
bool RStall = SPQ->getCurCycle() < right->getHeight();
// If scheduling one of the node will cause a pipeline stall, delay it.
// If scheduling either one of the node will cause a pipeline stall, sort them
// according to their height.
// If neither will cause a pipeline stall, try to reduce register pressure.
if (LStall) {
if (!RStall)
return true;
if (left->getHeight() != right->getHeight())
return left->getHeight() > right->getHeight();
} else if (RStall)
return false;
return BURRSort(left, right, SPQ);
}
template<class SF>
bool
RegReductionPriorityQueue<SF>::canClobber(const SUnit *SU, const SUnit *Op) {
@ -1379,8 +1436,8 @@ void RegReductionPriorityQueue<SF>::PrescheduleNodesWithMultipleUses() {
// Ok, the transformation is safe and the heuristics suggest it is
// profitable. Update the graph.
DEBUG(dbgs() << "Prescheduling SU # " << SU->NodeNum
<< " next to PredSU # " << PredSU->NodeNum
DEBUG(dbgs() << " Prescheduling SU #" << SU->NodeNum
<< " next to PredSU #" << PredSU->NodeNum
<< " to guide scheduling in the presence of multiple uses\n");
for (unsigned i = 0; i != PredSU->Succs.size(); ++i) {
SDep Edge = PredSU->Succs[i];
@ -1469,7 +1526,7 @@ void RegReductionPriorityQueue<SF>::AddPseudoTwoAddrDeps() {
(hasCopyToRegUse(SU) && !hasCopyToRegUse(SuccSU)) ||
(!SU->isCommutable && SuccSU->isCommutable)) &&
!scheduleDAG->IsReachable(SuccSU, SU)) {
DEBUG(dbgs() << "Adding a pseudo-two-addr edge from SU # "
DEBUG(dbgs() << " Adding a pseudo-two-addr edge from SU #"
<< SU->NodeNum << " to SU #" << SuccSU->NodeNum << "\n");
scheduleDAG->AddPred(SU, SDep(SuccSU, SDep::Order, /*Latency=*/0,
/*Reg=*/0, /*isNormalMemory=*/false,
@ -1563,8 +1620,7 @@ llvm::createBURRListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
BURegReductionPriorityQueue *PQ = new BURegReductionPriorityQueue(TII, TRI);
ScheduleDAGRRList *SD =
new ScheduleDAGRRList(*IS->MF, true, PQ);
ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, false, PQ);
PQ->setScheduleDAG(SD);
return SD;
}
@ -1577,8 +1633,7 @@ llvm::createTDRRListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
TDRegReductionPriorityQueue *PQ = new TDRegReductionPriorityQueue(TII, TRI);
ScheduleDAGRRList *SD =
new ScheduleDAGRRList(*IS->MF, false, PQ);
ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, false, PQ);
PQ->setScheduleDAG(SD);
return SD;
}
@ -1591,8 +1646,20 @@ llvm::createSourceListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
SrcRegReductionPriorityQueue *PQ = new SrcRegReductionPriorityQueue(TII, TRI);
ScheduleDAGRRList *SD =
new ScheduleDAGRRList(*IS->MF, true, PQ);
ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, false, PQ);
PQ->setScheduleDAG(SD);
return SD;
}
llvm::ScheduleDAGSDNodes *
llvm::createHybridListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
const TargetMachine &TM = IS->TM;
const TargetInstrInfo *TII = TM.getInstrInfo();
const TargetRegisterInfo *TRI = TM.getRegisterInfo();
HybridBURRPriorityQueue *PQ = new HybridBURRPriorityQueue(TII, TRI);
ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, true, PQ);
PQ->setScheduleDAG(SD);
return SD;
}

View File

@ -347,7 +347,7 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
const SDep& dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data,
OpSU->Latency, PhysReg);
if (!isChain && !UnitLatencies) {
ComputeOperandLatency(OpSU, SU, const_cast<SDep &>(dep));
ComputeOperandLatency(OpN, N, i, const_cast<SDep &>(dep));
ST.adjustSchedDependency(OpSU, SU, const_cast<SDep &>(dep));
}
@ -378,6 +378,10 @@ void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
}
const InstrItineraryData &InstrItins = TM.getInstrItineraryData();
if (InstrItins.isEmpty()) {
SU->Latency = 1;
return;
}
// Compute the latency for the node. We use the sum of the latencies for
// all nodes flagged together into this SUnit.
@ -389,6 +393,37 @@ void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
}
}
void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use,
unsigned OpIdx, SDep& dep) const{
// Check to see if the scheduler cares about latencies.
if (ForceUnitLatencies())
return;
const InstrItineraryData &InstrItins = TM.getInstrItineraryData();
if (InstrItins.isEmpty())
return;
if (dep.getKind() != SDep::Data)
return;
unsigned DefIdx = Use->getOperand(OpIdx).getResNo();
if (Def->isMachineOpcode() && Use->isMachineOpcode()) {
const TargetInstrDesc &II = TII->get(Def->getMachineOpcode());
if (DefIdx >= II.getNumDefs())
return;
int DefCycle = InstrItins.getOperandCycle(II.getSchedClass(), DefIdx);
if (DefCycle < 0)
return;
const unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass();
int UseCycle = InstrItins.getOperandCycle(UseClass, OpIdx);
if (UseCycle >= 0) {
int Latency = DefCycle - UseCycle + 1;
if (Latency >= 0)
dep.setLatency(Latency);
}
}
}
void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
if (!SU->getNode()) {
dbgs() << "PHYS REG COPY\n";

View File

@ -94,6 +94,15 @@ namespace llvm {
///
virtual void ComputeLatency(SUnit *SU);
/// ComputeOperandLatency - Override dependence edge latency using
/// operand use/def information
///
virtual void ComputeOperandLatency(SUnit *Def, SUnit *Use,
SDep& dep) const { }
virtual void ComputeOperandLatency(SDNode *Def, SDNode *Use,
unsigned OpIdx, SDep& dep) const;
virtual MachineBasicBlock *EmitSchedule();
/// Schedule - Order nodes according to selected style, filling

View File

@ -134,9 +134,11 @@ namespace llvm {
return createFastDAGScheduler(IS, OptLevel);
if (TLI.getSchedulingPreference() == Sched::Latency)
return createTDListDAGScheduler(IS, OptLevel);
assert(TLI.getSchedulingPreference() == Sched::RegPressure &&
if (TLI.getSchedulingPreference() == Sched::RegPressure)
return createBURRListDAGScheduler(IS, OptLevel);
assert(TLI.getSchedulingPreference() == Sched::Hybrid &&
"Unknown sched type!");
return createBURRListDAGScheduler(IS, OptLevel);
return createHybridListDAGScheduler(IS, OptLevel);
}
}