[X86] Memory folding for commutative instructions (updated)

This patch improves support for commutative instructions in the x86 memory folding implementation: if folding the original instruction fails, it attempts to fold a commuted version of the instruction. If that attempt also fails, the instruction is 're-commuted' back to its original operand order before returning.

Updated version of r219584 (reverted in r219595). The commutation attempt now explicitly ensures that neither of the commuted source operands is tied to the destination operand/register, which was the source of all the regressions seen with the original patch.
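
As a rough, self-contained illustration of the flow described in the two paragraphs above, the C++ sketch below models it outside of LLVM: try the fold at the requested operand; only if that fails, and neither commutable source is tied to the destination, commute the two sources, retry the fold on the partner operand, and commute back if the second attempt also fails. This is a toy model, not the code in this patch (the real changes are in the diff below); ToyInst, tryFoldAt and foldWithCommute are invented names.

// Toy model of the commute-and-retry folding flow described above; this is
// not the LLVM implementation (that is in the diff below).
#include <array>
#include <cstdio>
#include <optional>
#include <utility>

struct ToyInst {
  // Operand 0 is the destination; operands 1 and 2 are the commutable sources.
  std::array<int, 3> Regs;
  // Models a TIED_TO constraint between a source operand and the destination.
  std::array<bool, 3> TiedToDest{{false, false, false}};
};

// Stand-in for the target fold hook: in this toy only operand 2 has a
// memory-folding form, so folding at operand 1 fails.
static std::optional<ToyInst> tryFoldAt(const ToyInst &MI, unsigned OpIdx) {
  if (OpIdx == 2)
    return MI; // pretend a folded instruction was created
  return std::nullopt;
}

static std::optional<ToyInst> foldWithCommute(ToyInst &MI, unsigned OpIdx,
                                              bool AllowCommute) {
  // Original fold attempt.
  if (auto Folded = tryFoldAt(MI, OpIdx))
    return Folded;
  if (!AllowCommute)
    return std::nullopt;

  // Refuse to commute+fold if either commutable source is tied to the
  // destination register (the extra constraint this update describes).
  if ((MI.TiedToDest[1] && MI.Regs[1] == MI.Regs[0]) ||
      (MI.TiedToDest[2] && MI.Regs[2] == MI.Regs[0]))
    return std::nullopt;

  std::swap(MI.Regs[1], MI.Regs[2]);            // commute the sources
  unsigned PartnerIdx = (OpIdx == 1) ? 2u : 1u; // fold the other source index
  if (auto Folded = tryFoldAt(MI, PartnerIdx))
    return Folded;

  std::swap(MI.Regs[1], MI.Regs[2]);            // 're-commute' on failure
  return std::nullopt;
}

int main() {
  ToyInst MI{{{10, 11, 12}}};
  bool Folded =
      foldWithCommute(MI, /*OpIdx=*/1, /*AllowCommute=*/true).has_value();
  std::printf("folded after commute: %s\n", Folded ? "yes" : "no");
  return 0;
}

Built with -std=c++17, the example prints "folded after commute: yes", mirroring the case where the original fold fails but the commuted one succeeds.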

Added an additional regression test case provided by Joerg Sonnenberger.

Differential Revision: http://reviews.llvm.org/D5818



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@220239 91177308-0d34-0410-b5e6-96231b3b80d8
Simon Pilgrim 2014-10-20 22:14:22 +00:00
parent b9c7f60191
commit 0d1978b813
5 changed files with 202 additions and 59 deletions


@ -3337,7 +3337,8 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
AM.getFullAddress(AddrOps);
MachineInstr *Result =
XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment);
XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps,
Size, Alignment, /*AllowCommute=*/true);
if (!Result)
return false;


@ -3907,10 +3907,10 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
/// operand at the use. We fold the load instructions if load defines a virtual
/// register, the virtual register is used once in the same BB, and the
/// instructions in-between do not load or store, and have no side effects.
MachineInstr* X86InstrInfo::
optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
unsigned &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
const MachineRegisterInfo *MRI,
unsigned &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
if (FoldAsLoadDefReg == 0)
return nullptr;
// To be conservative, if there exists another load, clear the load candidate.
@ -3926,55 +3926,35 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
if (!DefMI->isSafeToMove(this, nullptr, SawStore))
return nullptr;
// We try to commute MI if possible.
unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1;
for (unsigned Idx = 0; Idx < IdxEnd; Idx++) {
// Collect information about virtual register operands of MI.
unsigned SrcOperandId = 0;
bool FoundSrcOperand = false;
for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
if (Reg != FoldAsLoadDefReg)
continue;
// Do not fold if we have a subreg use or a def or multiple uses.
if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
return nullptr;
SrcOperandId = i;
FoundSrcOperand = true;
}
if (!FoundSrcOperand) return nullptr;
// Check whether we can fold the def into SrcOperandId.
SmallVector<unsigned, 8> Ops;
Ops.push_back(SrcOperandId);
MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
if (FoldMI) {
FoldAsLoadDefReg = 0;
return FoldMI;
}
if (Idx == 1) {
// MI was changed but it didn't help, commute it back!
commuteInstruction(MI, false);
// Collect information about virtual register operands of MI.
unsigned SrcOperandId = 0;
bool FoundSrcOperand = false;
for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
if (Reg != FoldAsLoadDefReg)
continue;
// Do not fold if we have a subreg use or a def or multiple uses.
if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
return nullptr;
}
// Check whether we can commute MI and enable folding.
if (MI->isCommutable()) {
MachineInstr *NewMI = commuteInstruction(MI, false);
// Unable to commute.
if (!NewMI) return nullptr;
if (NewMI != MI) {
// New instruction. It doesn't need to be kept.
NewMI->eraseFromParent();
return nullptr;
}
}
SrcOperandId = i;
FoundSrcOperand = true;
}
if (!FoundSrcOperand)
return nullptr;
// Check whether we can fold the def into SrcOperandId.
SmallVector<unsigned, 8> Ops;
Ops.push_back(SrcOperandId);
MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
if (FoldMI) {
FoldAsLoadDefReg = 0;
return FoldMI;
}
return nullptr;
}
@ -4134,7 +4114,8 @@ MachineInstr*
X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr *MI, unsigned i,
const SmallVectorImpl<MachineOperand> &MOs,
unsigned Size, unsigned Align) const {
unsigned Size, unsigned Align,
bool AllowCommute) const {
const DenseMap<unsigned,
std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
bool isCallRegIndirect = Subtarget.callRegIndirect();
@ -4202,8 +4183,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
return nullptr;
// If this is a 64-bit load, but the spill slot is 32, then we can do
// a 32-bit load which is implicitly zero-extended. This likely is due
// to liveintervalanalysis remat'ing a load from stack slot.
// a 32-bit load which is implicitly zero-extended. This likely is
// due to live interval analysis remat'ing a load from stack slot.
if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
return nullptr;
Opcode = X86::MOV32rm;
@ -4222,8 +4203,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// to a 32-bit one.
unsigned DstReg = NewMI->getOperand(0).getReg();
if (TargetRegisterInfo::isPhysicalRegister(DstReg))
NewMI->getOperand(0).setReg(RI.getSubReg(DstReg,
X86::sub_32bit));
NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
else
NewMI->getOperand(0).setSubReg(X86::sub_32bit);
}
@ -4231,6 +4211,65 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
}
}
// If the instruction and target operand are commutable, commute the
// instruction and try again.
if (AllowCommute) {
unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2;
if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
bool HasDef = MI->getDesc().getNumDefs();
unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
bool Tied0 =
0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
bool Tied1 =
0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
// If either of the commutable operands are tied to the destination
// then we can not commute + fold.
if ((HasDef && Reg0 == Reg1 && Tied0) ||
(HasDef && Reg0 == Reg2 && Tied1))
return nullptr;
if ((CommuteOpIdx1 == OriginalOpIdx) ||
(CommuteOpIdx2 == OriginalOpIdx)) {
MachineInstr *CommutedMI = commuteInstruction(MI, false);
if (!CommutedMI) {
// Unable to commute.
return nullptr;
}
if (CommutedMI != MI) {
// New instruction. We can't fold from this.
CommutedMI->eraseFromParent();
return nullptr;
}
// Attempt to fold with the commuted version of the instruction.
unsigned CommuteOp =
(CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align,
/*AllowCommute=*/false);
if (NewMI)
return NewMI;
// Folding failed again - undo the commute before returning.
MachineInstr *UncommutedMI = commuteInstruction(MI, false);
if (!UncommutedMI) {
// Unable to commute.
return nullptr;
}
if (UncommutedMI != MI) {
// New instruction. It doesn't need to be kept.
UncommutedMI->eraseFromParent();
return nullptr;
}
// Return here to prevent duplicate fuse failure report.
return nullptr;
}
}
}
// No fusion
if (PrintFailedFusing && !MI->isCopy())
dbgs() << "We failed to fuse operand " << i << " in " << *MI;
@ -4440,7 +4479,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
SmallVector<MachineOperand,4> MOs;
MOs.push_back(MachineOperand::CreateFI(FrameIndex));
return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment);
return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
Size, Alignment, /*AllowCommute=*/true);
}
static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
@ -4593,7 +4633,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
break;
}
}
return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment);
return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
/*Size=*/0, Alignment, /*AllowCommute=*/true);
}


@ -404,7 +404,8 @@ public:
MachineInstr* MI,
unsigned OpNum,
const SmallVectorImpl<MachineOperand> &MOs,
unsigned Size, unsigned Alignment) const;
unsigned Size, unsigned Alignment,
bool AllowCommute) const;
void
getUnconditionalBranch(MCInst &Branch,


@ -0,0 +1,16 @@
; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
; Function Attrs: nounwind readonly uwtable
define <32 x double> @_Z14vstack_foldDv32_dS_(<32 x double> %a, <32 x double> %b) #0 {
%1 = fadd <32 x double> %a, %b
%2 = fsub <32 x double> %a, %b
%3 = fmul <32 x double> %1, %2
ret <32 x double> %3
;CHECK-NOT: vmovapd {{.*#+}} 32-byte Reload
;CHECK: vmulpd {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
;CHECK-NOT: vmovapd {{.*#+}} 32-byte Reload
}


@ -0,0 +1,84 @@
; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s
; Regression test for http://reviews.llvm.org/D5701
; ModuleID = 'xxhash.i'
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386--netbsd"
; CHECK-LABEL: fn1
; CHECK: shldl {{.*#+}} 4-byte Folded Spill
; CHECK: orl {{.*#+}} 4-byte Folded Reload
; CHECK: shldl {{.*#+}} 4-byte Folded Spill
; CHECK: orl {{.*#+}} 4-byte Folded Reload
; CHECK: addl {{.*#+}} 4-byte Folded Reload
; CHECK: imull {{.*#+}} 4-byte Folded Reload
; CHECK: orl {{.*#+}} 4-byte Folded Reload
; CHECK: retl
%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }
@a = common global i32 0, align 4
@b = common global i64 0, align 8
; Function Attrs: nounwind uwtable
define i64 @fn1() #0 {
entry:
%0 = load i32* @a, align 4, !tbaa !1
%1 = inttoptr i32 %0 to %struct.XXH_state64_t*
%total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0
%2 = load i32* %total_len, align 4, !tbaa !5
%tobool = icmp eq i32 %2, 0
br i1 %tobool, label %if.else, label %if.then
if.then: ; preds = %entry
%v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3
%3 = load i64* %v3, align 4, !tbaa !8
%v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4
%4 = load i64* %v4, align 4, !tbaa !9
%v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2
%5 = load i64* %v2, align 4, !tbaa !10
%shl = shl i64 %5, 1
%or = or i64 %shl, %5
%shl2 = shl i64 %3, 2
%shr = lshr i64 %3, 1
%or3 = or i64 %shl2, %shr
%add = add i64 %or, %or3
%mul = mul i64 %4, -4417276706812531889
%shl4 = mul i64 %4, -8834553413625063778
%shr5 = ashr i64 %mul, 3
%or6 = or i64 %shr5, %shl4
%mul7 = mul nsw i64 %or6, 1400714785074694791
%xor = xor i64 %add, %mul7
store i64 %xor, i64* @b, align 8, !tbaa !11
%mul8 = mul nsw i64 %xor, 1400714785074694791
br label %if.end
if.else: ; preds = %entry
%6 = load i64* @b, align 8, !tbaa !11
%xor10 = xor i64 %6, -4417276706812531889
%mul11 = mul nsw i64 %xor10, 400714785074694791
br label %if.end
if.end: ; preds = %if.else, %if.then
%storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ]
%storemerge = add i64 %storemerge.in, -8796714831421723037
store i64 %storemerge, i64* @b, align 8, !tbaa !11
ret i64 undef
}
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = metadata !{metadata !"clang version 3.6 (trunk 219587)"}
!1 = metadata !{metadata !2, metadata !2, i64 0}
!2 = metadata !{metadata !"int", metadata !3, i64 0}
!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
!4 = metadata !{metadata !"Simple C/C++ TBAA"}
!5 = metadata !{metadata !6, metadata !2, i64 0}
!6 = metadata !{metadata !"XXH_state64_t", metadata !2, i64 0, metadata !2, i64 4, metadata !7, i64 8, metadata !7, i64 16, metadata !7, i64 24}
!7 = metadata !{metadata !"long long", metadata !3, i64 0}
!8 = metadata !{metadata !6, metadata !7, i64 16}
!9 = metadata !{metadata !6, metadata !7, i64 24}
!10 = metadata !{metadata !6, metadata !7, i64 8}
!11 = metadata !{metadata !7, metadata !7, i64 0}