Break false dependencies before partial register updates.

Two new TargetInstrInfo hooks let the target tell ExecutionDepsFix
about instructions with partial register updates that cause unwanted
false dependencies.

The ExecutionDepsFix pass will break the false dependencies if the
updated register was written in the previous N instructions.

The small loop added to sse-domains.ll runs twice as fast with
dependency-breaking instructions inserted.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@144602 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Jakob Stoklund Olesen 2011-11-15 01:15:30 +00:00
parent 2947f730a9
commit c2ecf3efbf
5 changed files with 193 additions and 0 deletions

View File

@ -718,6 +718,74 @@ public:
///
virtual void setExecutionDomain(MachineInstr *MI, unsigned Domain) const {}
/// getPartialRegUpdateClearance - Returns the preferred minimum clearance
/// before an instruction with an unwanted partial register update.
///
/// Some instructions only write part of a register, and implicitly need to
/// read the other parts of the register. This may cause unwanted stalls
/// preventing otherwise unrelated instructions from executing in parallel in
/// an out-of-order CPU.
///
/// For example, the x86 instruction cvtsi2ss writes its result to bits
/// [31:0] of the destination xmm register. Bits [127:32] are unaffected, so
/// the instruction needs to wait for the old value of the register to become
/// available:
///
///   addps %xmm1, %xmm0
///   movaps %xmm0, (%rax)
///   cvtsi2ss %rbx, %xmm0
///
/// In the code above, the cvtsi2ss instruction needs to wait for the addps
/// instruction before it can issue, even though the high bits of %xmm0
/// probably aren't needed.
///
/// This hook returns the preferred clearance before MI, measured in
/// instructions. Other defs of MI's operand OpNum are avoided in the last N
/// instructions before MI. It should only return a positive value for
/// unwanted dependencies. If the old bits of the defined register have
/// useful values, or if MI is determined to otherwise read the dependency,
/// the hook should return 0.
///
/// The unwanted dependency may be handled by:
///
/// 1. Allocating the same register for an MI def and use. That makes the
///    unwanted dependency identical to a required dependency.
///
/// 2. Allocating a register for the def that has no defs in the previous N
///    instructions.
///
/// 3. Calling breakPartialRegDependency() with the same arguments. This
///    allows the target to insert a dependency breaking instruction.
///
/// Parameters:
///   MI    - The instruction being queried.
///   OpNum - Index of the operand of MI whose register is being asked about.
///   TRI   - Target register info, passed through for the target's use.
///
/// Returns the preferred clearance N in instructions, or 0 when operand
/// OpNum of MI has no unwanted partial register dependency.
///
virtual unsigned
getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {
// The default implementation returns 0 for no partial register dependency.
// Targets that have such instructions provide their own implementation.
return 0;
}
/// breakPartialRegDependency - Insert a dependency-breaking instruction
/// before MI to eliminate an unwanted dependency on OpNum.
///
/// If it wasn't possible to avoid a def in the last N instructions before MI
/// (see getPartialRegUpdateClearance), this hook will be called to break the
/// unwanted dependency.
///
/// On x86, an xorps instruction can be used as a dependency breaker:
///
///   addps %xmm1, %xmm0
///   movaps %xmm0, (%rax)
///   xorps %xmm0, %xmm0
///   cvtsi2ss %rbx, %xmm0
///
/// An <imp-kill> operand should be added to MI if an instruction was
/// inserted. This ties the instructions together in the post-ra scheduler.
///
/// Parameters:
///   MI    - The instruction with the unwanted partial register update; any
///           dependency-breaking instruction goes before it.
///   OpNum - Index of the operand of MI whose dependency should be broken.
///   TRI   - Target register info, passed through for the target's use.
///
/// The default implementation has an empty body and inserts nothing.
///
virtual void
breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {}
private:
int CallFrameSetupOpcode, CallFrameDestroyOpcode;
};

View File

@ -471,11 +471,34 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
<< '\t' << *MI);
// How many instructions since rx was last written?
unsigned Clearance = CurInstr - LiveRegs[rx].Def;
LiveRegs[rx].Def = CurInstr;
// Kill off domains redefined by generic instructions.
if (Kill)
kill(rx);
// Verify clearance before partial register updates.
unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
if (!Pref)
continue;
DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
if (Pref > Clearance) {
DEBUG(dbgs() << ": Break dependency.\n");
TII->breakPartialRegDependency(MI, i, TRI);
continue;
}
// The current clearance seems OK, but we may be ignoring a def from a
// back-edge.
if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
DEBUG(dbgs() << ": OK.\n");
continue;
}
// A def from an unprocessed back-edge may make us break this dependency.
DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
}
++CurInstr;
@ -663,6 +686,10 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
for (unsigned i = 0, e = Loops.size(); i != e; ++i) {
MachineBasicBlock *MBB = Loops[i];
enterBasicBlock(MBB);
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
++I)
if (!I->isDebugValue())
processDefs(I, false);
leaveBasicBlock(MBB);
}

View File

@ -2761,6 +2761,10 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
///
static bool hasPartialRegUpdate(unsigned Opcode) {
switch (Opcode) {
case X86::CVTSI2SSrr:
case X86::CVTSI2SS64rr:
case X86::CVTSI2SDrr:
case X86::CVTSI2SD64rr:
case X86::CVTSD2SSrr:
case X86::Int_CVTSD2SSrr:
case X86::CVTSS2SDrr:
@ -2789,6 +2793,54 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
return false;
}
/// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle
/// instructions we would like before a partial register update.
unsigned X86InstrInfo::
getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {
if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode()))
return 0;
// If MI is marked as reading Reg, the partial register update is wanted.
const MachineOperand &MO = MI->getOperand(0);
unsigned Reg = MO.getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
if (MO.readsReg() || MI->readsVirtualRegister(Reg))
return 0;
} else {
if (MI->readsRegister(Reg, TRI))
return 0;
}
// If any of the preceding 16 instructions are reading Reg, insert a
// dependency breaking instruction. The magic number is based on a few
// Nehalem experiments.
return 16;
}
/// breakPartialRegDependency - Insert an xorps/vxorps that clears the register
/// in operand OpNum of MI directly before MI, acting as a dependency-breaking
/// instruction. Registers outside the VR128 and VR256 classes are left alone.
/// When an instruction is inserted, MI gets an implicit kill of the register.
void X86InstrInfo::
breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
                          const TargetRegisterInfo *TRI) const {
  const unsigned Reg = MI->getOperand(OpNum).getReg();
  const bool IsXMM = X86::VR128RegClass.contains(Reg);

  // Only xmm and ymm registers are handled; bail out without touching MI for
  // anything else.
  if (!IsXMM && !X86::VR256RegClass.contains(Reg))
    return;

  MachineBasicBlock &MBB = *MI->getParent();
  if (IsXMM) {
    // These instructions are all floating point domain, so xorps is the best
    // choice. The AVX form is used when the subtarget has AVX.
    const unsigned XorOpc = TM.getSubtarget<X86Subtarget>().hasAVX()
                                ? X86::VXORPSrr
                                : X86::XORPSrr;
    BuildMI(MBB, MI, MI->getDebugLoc(), get(XorOpc), Reg)
        .addReg(Reg, RegState::Undef)
        .addReg(Reg, RegState::Undef);
  } else {
    // Use vxorps to clear the full ymm register.
    // It wants to read and write the xmm sub-register, so the ymm register is
    // added as an implicit def.
    const unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
    BuildMI(MBB, MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg)
        .addReg(XReg, RegState::Undef)
        .addReg(XReg, RegState::Undef)
        .addReg(Reg, RegState::ImplicitDefine);
  }

  // Add an implicit kill of Reg to MI now that a breaker was inserted.
  MI->addRegisterKilled(Reg, TRI, true);
}
MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops,

View File

@ -345,6 +345,11 @@ public:
void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
/// getPartialRegUpdateClearance - Returns the number of instructions of
/// clearance preferred before MI when operand OpNum is an unwanted partial
/// register update, or 0 when no clearance is needed.
unsigned getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const;
/// breakPartialRegDependency - Insert a dependency-breaking instruction
/// before MI for the register in operand OpNum.
void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const;
MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr* MI,
unsigned OpNum,

View File

@ -43,3 +43,44 @@ while.body:
while.end:
ret void
}
; CHECK: f2
;
; This loop contains two cvtsi2ss instructions that update the same xmm
; register. Verify that the execution dependency fix pass breaks those
; dependencies by inserting xorps instructions.
;
; If the register allocator chooses different registers for the two cvtsi2ss
; instructions, they are still dependent on themselves.
; CHECK: xorps [[XMM1:%xmm[0-9]+]]
; CHECK: , [[XMM1]]
; CHECK: cvtsi2ss %{{.*}}, [[XMM1]]
; CHECK: xorps [[XMM2:%xmm[0-9]+]]
; CHECK: , [[XMM2]]
; CHECK: cvtsi2ss %{{.*}}, [[XMM2]]
;
; f2 - Accumulates two float sums in a loop, each fed by an integer-to-float
; conversion, and returns their difference. The two sitofp conversions are
; what the check lines above expect to appear as cvtsi2ss.
define float @f2(i32 %m) nounwind uwtable readnone ssp {
entry:
; Skip the loop entirely when %m is zero.
%tobool3 = icmp eq i32 %m, 0
br i1 %tobool3, label %for.end, label %for.body
for.body: ; preds = %entry, %for.body
; Loop-carried values: down-counter (from %m), the two running sums (from
; 0.0), and an up-counter (from 1).
%m.addr.07 = phi i32 [ %dec, %for.body ], [ %m, %entry ]
%s1.06 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]
%s2.05 = phi float [ %add2, %for.body ], [ 0.000000e+00, %entry ]
%n.04 = phi i32 [ %inc, %for.body ], [ 1, %entry ]
; First conversion: add the up-counter, as a float, to the first sum.
%conv = sitofp i32 %n.04 to float
%add = fadd float %s1.06, %conv
; Second conversion: add the down-counter, as a float, to the second sum.
%conv1 = sitofp i32 %m.addr.07 to float
%add2 = fadd float %s2.05, %conv1
; Step both counters; the loop exits when the down-counter reaches zero.
%inc = add nsw i32 %n.04, 1
%dec = add nsw i32 %m.addr.07, -1
%tobool = icmp eq i32 %dec, 0
br i1 %tobool, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
; Both sums are 0.0 when the loop was skipped.
%s1.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
%s2.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ]
%sub = fsub float %s1.0.lcssa, %s2.0.lcssa
ret float %sub
}