mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-12 02:33:33 +00:00
Avoid overly aggressive latency scheduling. If the two nodes share an
operand and one of them has a single use that is a live-out copy, favor the
one that is live out. Otherwise it will be difficult to eliminate the copy
if the instruction is a loop induction variable update, e.g.

BB:
  sub r1, r3, #1
  str r0, [r2, r3]
  mov r3, r1
  cmp
  bne BB

=>

BB:
  str r0, [r2, r3]
  sub r3, r3, #1
  cmp
  bne BB

This fixed the recent 256.bzip2 regression.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@117675 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
d7e473c629
commit
089751535d
@ -190,7 +190,7 @@ private:
|
||||
void ScheduleDAGRRList::Schedule() {
|
||||
DEBUG(dbgs()
|
||||
<< "********** List Scheduling BB#" << BB->getNumber()
|
||||
<< " **********\n");
|
||||
<< " '" << BB->getName() << "' **********\n");
|
||||
|
||||
NumLiveRegs = 0;
|
||||
LiveRegDefs.resize(TRI->getNumRegs(), NULL);
|
||||
@ -1483,6 +1483,46 @@ static unsigned calcMaxScratches(const SUnit *SU) {
|
||||
return Scratches;
|
||||
}
|
||||
|
||||
/// hasOnlyLiveOutUses - Return true if SU has at least one value (non-control)
|
||||
/// successor and every value successor is a CopyToReg to a virtual register.
|
||||
/// Such a def is probably a liveout with no other use, so it should be
/// scheduled closer to the terminator.
|
||||
static bool hasOnlyLiveOutUses(const SUnit *SU) {
|
||||
// Stays false until a qualifying CopyToReg successor is seen, so an SU with
// no value successors at all yields false.
bool RetVal = false;
|
||||
for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
|
||||
I != E; ++I) {
|
||||
// Control edges are not value uses; skip them.
if (I->isCtrl()) continue;
|
||||
const SUnit *SuccSU = I->getSUnit();
|
||||
if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg) {
|
||||
// Operand 1 of the CopyToReg node is read as the copy's register operand.
unsigned Reg =
|
||||
cast<RegisterSDNode>(SuccSU->getNode()->getOperand(1))->getReg();
|
||||
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
|
||||
RetVal = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// Any other value successor (a non-CopyToReg node, or a CopyToReg whose
// register is not virtual) disqualifies SU immediately.
return false;
|
||||
}
|
||||
return RetVal;
|
||||
}
|
||||
|
||||
/// UnitsSharePred - Return true if the two scheduling units share a common
|
||||
/// data predecessor.
/// Control (chain) predecessors are ignored on both sides, so two units that
/// are related only through chain edges do not count as sharing a predecessor.
|
||||
static bool UnitsSharePred(const SUnit *left, const SUnit *right) {
|
||||
// Collect the data predecessors of 'left' first...
SmallSet<const SUnit*, 4> Preds;
|
||||
for (SUnit::const_pred_iterator I = left->Preds.begin(),E = left->Preds.end();
|
||||
I != E; ++I) {
|
||||
if (I->isCtrl()) continue; // ignore chain preds
|
||||
Preds.insert(I->getSUnit());
|
||||
}
|
||||
// ...then probe that set with each data predecessor of 'right', stopping at
// the first match.
for (SUnit::const_pred_iterator I = right->Preds.begin(),E = right->Preds.end();
|
||||
I != E; ++I) {
|
||||
if (I->isCtrl()) continue; // ignore chain preds
|
||||
if (Preds.count(I->getSUnit()))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename RRSort>
|
||||
static bool BURRSort(const SUnit *left, const SUnit *right,
|
||||
const RegReductionPriorityQueue<RRSort> *SPQ) {
|
||||
@ -1558,29 +1598,46 @@ bool hybrid_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{
|
||||
else if (!LHigh && RHigh)
|
||||
return false;
|
||||
else if (!LHigh && !RHigh) {
|
||||
// If the two nodes share an operand and one of them has a single
|
||||
// use that is a live out copy, favor the one that is live out. Otherwise
|
||||
// it will be difficult to eliminate the copy if the instruction is a
|
||||
// loop induction variable update. e.g.
|
||||
// BB:
|
||||
// sub r1, r3, #1
|
||||
// str r0, [r2, r3]
|
||||
// mov r3, r1
|
||||
// cmp
|
||||
// bne BB
|
||||
bool SharePred = UnitsSharePred(left, right);
|
||||
// FIXME: Only adjust if BB is a loop back edge.
|
||||
// FIXME: What's the cost of a copy?
|
||||
int LBonus = (SharePred && hasOnlyLiveOutUses(left)) ? 1 : 0;
|
||||
int RBonus = (SharePred && hasOnlyLiveOutUses(right)) ? 1 : 0;
|
||||
int LHeight = (int)left->getHeight() - LBonus;
|
||||
int RHeight = (int)right->getHeight() - RBonus;
|
||||
|
||||
// Low register pressure situation, schedule for latency if possible.
|
||||
bool LStall = left->SchedulingPref == Sched::Latency &&
|
||||
SPQ->getCurCycle() < left->getHeight();
|
||||
(int)SPQ->getCurCycle() < LHeight;
|
||||
bool RStall = right->SchedulingPref == Sched::Latency &&
|
||||
SPQ->getCurCycle() < right->getHeight();
|
||||
(int)SPQ->getCurCycle() < RHeight;
|
||||
// If scheduling one of the node will cause a pipeline stall, delay it.
|
||||
// If scheduling either one of the node will cause a pipeline stall, sort
|
||||
// them according to their height.
|
||||
// If neither will cause a pipeline stall, try to reduce register pressure.
|
||||
if (LStall) {
|
||||
if (!RStall)
|
||||
return true;
|
||||
if (left->getHeight() != right->getHeight())
|
||||
return left->getHeight() > right->getHeight();
|
||||
if (LHeight != RHeight)
|
||||
return LHeight > RHeight;
|
||||
} else if (RStall)
|
||||
return false;
|
||||
|
||||
// If either node is scheduling for latency, sort them by height and latency
|
||||
// first.
|
||||
// If either node is scheduling for latency, sort them by height
|
||||
// and latency.
|
||||
if (left->SchedulingPref == Sched::Latency ||
|
||||
right->SchedulingPref == Sched::Latency) {
|
||||
if (left->getHeight() != right->getHeight())
|
||||
return left->getHeight() > right->getHeight();
|
||||
if (LHeight != RHeight)
|
||||
return LHeight > RHeight;
|
||||
if (left->Latency != right->Latency)
|
||||
return left->Latency > right->Latency;
|
||||
}
|
||||
@ -1631,19 +1688,6 @@ RegReductionPriorityQueue<SF>::canClobber(const SUnit *SU, const SUnit *Op) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/// hasCopyToRegUse - Return true if SU has a value successor that is a
|
||||
/// CopyToReg node.
/// Control successors are skipped. Only the successor's opcode is checked;
/// the register the copy refers to is not examined.
|
||||
static bool hasCopyToRegUse(const SUnit *SU) {
|
||||
for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
|
||||
I != E; ++I) {
|
||||
// Control edges are not value uses; skip them.
if (I->isCtrl()) continue;
|
||||
const SUnit *SuccSU = I->getSUnit();
|
||||
// The first value successor that is a CopyToReg is enough to answer true.
if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/// canClobberPhysRegDefs - True if SU would clobber one of SuccSU's
|
||||
/// physical register defs.
|
||||
static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU,
|
||||
@ -1813,6 +1857,7 @@ void RegReductionPriorityQueue<SF>::AddPseudoTwoAddrDeps() {
|
||||
if (!Node || !Node->isMachineOpcode() || SU->getNode()->getFlaggedNode())
|
||||
continue;
|
||||
|
||||
bool isLiveOut = hasOnlyLiveOutUses(SU);
|
||||
unsigned Opc = Node->getMachineOpcode();
|
||||
const TargetInstrDesc &TID = TII->get(Opc);
|
||||
unsigned NumRes = TID.getNumDefs();
|
||||
@ -1862,7 +1907,7 @@ void RegReductionPriorityQueue<SF>::AddPseudoTwoAddrDeps() {
|
||||
SuccOpc == TargetOpcode::SUBREG_TO_REG)
|
||||
continue;
|
||||
if ((!canClobber(SuccSU, DUSU) ||
|
||||
(hasCopyToRegUse(SU) && !hasCopyToRegUse(SuccSU)) ||
|
||||
(isLiveOut && !hasOnlyLiveOutUses(SuccSU)) ||
|
||||
(!SU->isCommutable && SuccSU->isCommutable)) &&
|
||||
!scheduleDAG->IsReachable(SuccSU, SU)) {
|
||||
DEBUG(dbgs() << " Adding a pseudo-two-addr edge from SU #"
|
||||
|
@ -458,6 +458,15 @@ void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use,
|
||||
// Adjust the use operand index by num of defs.
|
||||
OpIdx += TII->get(Use->getMachineOpcode()).getNumDefs();
|
||||
int Latency = TII->getOperandLatency(InstrItins, Def, DefIdx, Use, OpIdx);
|
||||
if (Latency > 1 && Use->getOpcode() == ISD::CopyToReg &&
|
||||
!BB->succ_empty()) {
|
||||
unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
|
||||
if (TargetRegisterInfo::isVirtualRegister(Reg))
|
||||
// This copy is a liveout value. It is likely coalesced, so reduce the
|
||||
// latency so not to penalize the def.
|
||||
// FIXME: need target specific adjustment here?
|
||||
Latency = (Latency > 1) ? Latency - 1 : 1;
|
||||
}
|
||||
if (Latency >= 0)
|
||||
dep.setLatency(Latency);
|
||||
}
|
||||
|
@ -1967,8 +1967,13 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
|
||||
if (!ItinData || ItinData->isEmpty())
|
||||
return DefTID.mayLoad() ? 3 : 1;
|
||||
|
||||
if (!UseNode->isMachineOpcode())
|
||||
return ItinData->getOperandCycle(DefTID.getSchedClass(), DefIdx);
|
||||
if (!UseNode->isMachineOpcode()) {
|
||||
int Latency = ItinData->getOperandCycle(DefTID.getSchedClass(), DefIdx);
|
||||
if (Subtarget.isCortexA9())
|
||||
return Latency <= 2 ? 1 : Latency - 1;
|
||||
else
|
||||
return Latency <= 3 ? 1 : Latency - 2;
|
||||
}
|
||||
|
||||
const TargetInstrDesc &UseTID = get(UseNode->getMachineOpcode());
|
||||
const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode);
|
||||
|
@ -23,7 +23,10 @@ entry:
|
||||
%4 = insertelement <2 x double> %2, double %V.0.ph, i32 1 ; <<2 x double>> [#uses=2]
|
||||
; Constant pool load followed by add.
|
||||
; Then clobber the loaded register, not the sum.
|
||||
; CHECK: vldr.64
|
||||
; CHECK: vadd.f64
|
||||
; CHECK: vldr.64 [[LDR:d.*]],
|
||||
; CHECK: LPC0_0:
|
||||
; CHECK: vadd.f64 [[ADD:d.*]], [[LDR]], [[LDR]]
|
||||
; CHECK: vmov.f64 [[LDR]]
|
||||
%5 = fadd <2 x double> %3, %3 ; <<2 x double>> [#uses=2]
|
||||
|
@ -15,9 +15,9 @@ bb.nph: ; preds = %bb5
|
||||
|
||||
; Loop preheader
|
||||
; CHECK: vmov.f32
|
||||
; CHECK: vmul.f32
|
||||
; CHECK: vsub.f32
|
||||
; CHECK: vadd.f32
|
||||
; CHECK: vmul.f32
|
||||
bb7: ; preds = %bb9, %bb.nph
|
||||
%s1.02 = phi float [ undef, %bb.nph ], [ %35, %bb9 ] ; <float> [#uses=3]
|
||||
%tmp79 = add i32 undef, undef ; <i32> [#uses=1]
|
||||
|
Loading…
x
Reference in New Issue
Block a user