Replace vfmaddxx213 instructions with their 231-type equivalents in accumulator

loops. Writing back to the accumulator (231-type) allows the coalescer to
eliminate an extra copy.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@199933 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Lang Hames 2014-01-23 20:23:36 +00:00
parent b961ae25b4
commit d8f4348cab
4 changed files with 122 additions and 3 deletions

View File

@ -15963,6 +15963,81 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
return MBB;
}
// Replace 213-type (isel default) FMA3 instructions with 231-type for
// accumulator loops. Writing back to the accumulator allows the coalescer
// to remove extra copies in the loop.
MachineBasicBlock *
X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
MachineBasicBlock *MBB) const {
MachineOperand &AddendOp = MI->getOperand(3);
// Bail out early if the addend isn't a register - we can't switch these.
if (!AddendOp.isReg())
return MBB;
MachineFunction &MF = *MBB->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
// Check whether the addend is defined by a PHI:
assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
MachineInstr &AddendDef = *MRI.def_begin(AddendOp.getReg());
if (!AddendDef.isPHI())
return MBB;
// Look for the following pattern:
// loop:
// %addend = phi [%entry, 0], [%loop, %result]
// ...
// %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
// Replace with:
// loop:
// %addend = phi [%entry, 0], [%loop, %result]
// ...
// %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
assert(AddendDef.getOperand(i).isReg());
MachineOperand PHISrcOp = AddendDef.getOperand(i);
MachineInstr &PHISrcInst = *MRI.def_begin(PHISrcOp.getReg());
if (&PHISrcInst == MI) {
// Found a matching instruction.
unsigned NewFMAOpc = 0;
switch (MI->getOpcode()) {
case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
default: llvm_unreachable("Unrecognized FMA variant.");
}
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
MachineInstrBuilder MIB =
BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
.addOperand(MI->getOperand(0))
.addOperand(MI->getOperand(3))
.addOperand(MI->getOperand(2))
.addOperand(MI->getOperand(1));
MBB->insert(MachineBasicBlock::iterator(MI), MIB);
MI->eraseFromParent();
}
}
return MBB;
}
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
@ -16194,6 +16269,32 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
case X86::VFMADDPDr213r:
case X86::VFMADDPSr213r:
case X86::VFMADDSDr213r:
case X86::VFMADDSSr213r:
case X86::VFMSUBPDr213r:
case X86::VFMSUBPSr213r:
case X86::VFMSUBSDr213r:
case X86::VFMSUBSSr213r:
case X86::VFNMADDPDr213r:
case X86::VFNMADDPSr213r:
case X86::VFNMADDSDr213r:
case X86::VFNMADDSSr213r:
case X86::VFNMSUBPDr213r:
case X86::VFNMSUBPSr213r:
case X86::VFNMSUBSDr213r:
case X86::VFNMSUBSSr213r:
case X86::VFMADDPDr213rY:
case X86::VFMADDPSr213rY:
case X86::VFMSUBPDr213rY:
case X86::VFMSUBPSr213rY:
case X86::VFNMADDPDr213rY:
case X86::VFNMADDPSr213rY:
case X86::VFNMSUBPDr213rY:
case X86::VFNMSUBPSr213rY:
return emitFMA3Instr(MI, BB);
}
}

View File

@ -972,6 +972,9 @@ namespace llvm {
MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
MachineBasicBlock *MBB) const;
MachineBasicBlock *emitFMA3Instr(MachineInstr *MI,
MachineBasicBlock *MBB) const;
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent, for use with the given x86 condition code.
SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const;

View File

@ -20,7 +20,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
PatFrag MemFrag128, PatFrag MemFrag256,
ValueType OpVT128, ValueType OpVT256,
SDPatternOperator Op = null_frag> {
let isCommutable = 1 in
let isCommutable = 1, usesCustomInserter = 1 in
def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@ -36,7 +36,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
[(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
(MemFrag128 addr:$src3))))]>;
let isCommutable = 1 in
let isCommutable = 1, usesCustomInserter = 1 in
def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
@ -118,7 +118,7 @@ let Constraints = "$src1 = $dst" in {
multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
RegisterClass RC, ValueType OpVT, PatFrag mem_frag,
SDPatternOperator OpNode = null_frag> {
let isCommutable = 1 in
let isCommutable = 1, usesCustomInserter = 1 in
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,

View File

@ -42,6 +42,21 @@ entry:
ret float %call
}
; Test FMA3 variant selection
; CHECK: fma3_select231:
; CHECK: vfmadd231ss
define float @fma3_select231(float %x, float %y, i32 %N) #0 {
entry:
br label %while.body
while.body: ; preds = %while.body, %while.body
%acc.01 = phi float [ 0.000000e+00, %entry ], [ %acc, %while.body ]
%acc = tail call float @llvm.fma.f32(float %x, float %y, float %acc.01) nounwind readnone
%b = fcmp ueq float %acc, 0.0
br i1 %b, label %while.body, label %while.end
while.end: ; preds = %while.body, %entry
ret float %acc
}
declare float @llvm.fma.f32(float, float, float) nounwind readnone
declare double @llvm.fma.f64(double, double, double) nounwind readnone
declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) nounwind readnone